# R Options
options(stringsAsFactors=FALSE,
"citation_format"="pandoc",
dplyr.summarise.inform=FALSE,
knitr.table.format="html",
future.globals.maxSize=2000000000, mc.cores=4, future.fork.enable=TRUE, future.plan="multicore")
# Python needed for clustering, umap, other python packages
# Change location if necessary
Sys.setenv(RETICULATE_PYTHON = "/usr/bin/python")
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for 'leiden' clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat
# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix
# Tables: kableExtra
# Plots: ggsci, ggpubr
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast
# Functional enrichment: enrichR
# Other: sessioninfo, cerebroApp
# Knitr default options
knitr::opts_chunk$set(echo=TRUE, cache=FALSE, message=TRUE, warning=TRUE, tidy=FALSE, fig.width=10, class.source='fold-hide')
# biomaRt mirror: NULL means the first working mirror is used
mirror = NULLThis code chunk contains all parameters that are set specifically for the project.
param = list()
# Project ID
param$project_id = "pbmc"
# Input data path in case Cell Ranger was run
param$path_data = data.frame(name=c("pbmc_10x","pbmc_smartseq2"),
type=c("10x","smartseq2"),
path=c("test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/","test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz"))
param$file_mapping_stats = NULL
# Project-specific paths
param$path_out = "test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/"
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)
# Marker genes based on literature, translated to Ensembl IDs
# xlsx file, one list per column, first row as header and Ensembl IDs below
# Set to NULL if no known marker genes should be plotted
param$file_known_markers = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/known_markers.xlsx"
# Annotation via biomaRt
param$mart_dataset = "hsapiens_gene_ensembl"
param$annot_version = 98
param$annot_main = c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="external_gene_name")
param$file_annot = NULL
if (is.null(param$file_annot)) {
param$file_annot = file.path(param$path_out, paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))
}
param$mart_attributes = c(param$annot_main,
c("chromosome_name", "start_position", "end_position",
"percentage_gene_gc_content", "gene_biotype", "strand", "description"))
# Prefix of mitochondrial genes
param$mt = "^MT-"
# Filters
param$cell_filter = list(nFeature_RNA=c(200,NA), percent_mt=c(NA,20))
param$feature_filter = list(min_counts=1, min_cells=3) # feature has to be found by at least one count in one cell
param$samples_to_drop = c("NC", "RNA") # cells from these samples will be dropped after initial QC
# Whether or not to remove cell cycle effects
param$cc_remove = FALSE
# Should all cell cycle effects be removed, or only the difference between profilerating cells (G2M and S phase)?
# Read https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html, for an explanation
param$cc_remove_all = FALSE
# Additional (unwanted) variables that will be regressed out for visualisation and clustering
param$vars_to_regress = c()
# Additional (unwanted) variables to be included in the statistical tests
param$latent_vars = c()
# When there are multiple datasets, how to integrate them:
# - method:
# - merge: Just merge them since no integration is needed (e.g. samples were multiplexed on the same chip)
# - standard: Anchors are computed for all pairs of datasets. This will give all datasets the same weight during dataset integration but can be computationally intensive.
# - reference: One dataset is used as reference and anchors are computed for all other datasets. Less accurate but computationally faster.
# - reciprocal: Anchors are computed in PCA space instead of the data. Even less accurate but for very big datasets.
#
# - reference_dataset: When using method "reference", which dataset is the reference? Can be numeric or name of the dataset.
# - dimensions: Number of dimensions to consider for integration
param$integrate_samples = list(method="standard", reference_dataset=1, dimensions=30)
# Which normalisation should be used for analysis: RNA or SCT?
param$normalisation_default = "SCT"
# The number of PCs to use; adjust this parameter based on the Elbowplot
param$pc_n = 10
# Resolution of clusters; low values will lead to fewer clusters of cells
param$cluster_resolution=0.5
# Thresholds to define differentially expressed genes
param$padj = 0.05
param$log2fc = log2(1.5)
# P-value threshold for functional enrichment
param$p_enrichr = 0.05
# Enrichr databases of interest
param$enrichr_dbs = c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018")
# Main colour(s) to use for plots
param$col = "palevioletred"
# Colour palette and colours used for samples
param$col_palette_samples = "ggsci::pal_jama"
# Colour palette and colours used for cluster
param$col_palette_clusters = "ggsci::pal_startrek"
# Sample data to at most n cells per dataset/sample (mainly for tests); set to NULL to deactivate
param$sample_cells = 500
# Path to git repository
param$path_to_git = "."We begin by printing mapping statistics that have been produced prior to this workflow.
# Are statistics provided?
if (!all(is.na(param$path_data$stats))) {
# Loop through all samples and read mapping stats
mapping_stats_list = list()
for (i in 1:nrow(param$path_data)) {
if (!is.na(param$path_data$stats[i])) {
mapping_stats_list[[param$path_data$name[i]]] = read.delim(param$path_data$stats[i],
sep=",", header=FALSE, check.names=FALSE) %>%
t() %>% as.data.frame()
}
}
# Join all mapping stats tables
mapping_stats = mapping_stats_list %>% purrr::reduce(dplyr::full_join, by="V1")
rownames(mapping_stats) = mapping_stats[["V1"]]
mapping_stats = mapping_stats %>% dplyr::select(-V1)
colnames(mapping_stats) = names(mapping_stats_list)
# Print table to HTML
knitr::kable(mapping_stats, align="l", caption="Mapping statistics") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
} else {
message("Mapping statistics cannot be shown. No valid file provided.")
}× (Message) Mapping statistics cannot be shown. No valid file provided.
If not provided already, we read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat rownames.
# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
annot_ensembl = read.delim(param$file_annot)
} else {
annot_mart = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset=param$mart_dataset,
mirror=mirror,
version=param$annot_version))
annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes)
write.table(annot_ensembl, file=param$file_annot, sep='\t', col.names=TRUE, row.names=FALSE, append=FALSE)
message("Gene annotation file was created at: ", param$file_annot)
# Note: depending on the attributes, there might be more than one row per gene
}
# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Create translation tables
ensembl = param$annot_main["ensembl"]
symbol = param$annot_main["symbol"]
entrez = param$annot_main["entrez"]
# Ensembl id to gene symbol
ensembl_to_symbol = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])
# Ensembl id to seurat-compatible unique rowname
ensembl_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname[, symbol] = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])
# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)
# Gene symbol to ensembl id: named LIST to account for genes where one symbol translates to multiple Ensembl IDs
symbol_to_ensembl_df = unique(annot_ensembl[, c(ensembl, symbol)])
symbol_to_ensembl = split(symbol_to_ensembl_df[, ensembl], symbol_to_ensembl_df[, symbol])
# Gene symbol to (seurat compatible unique) gene symbol: named LIST to account for genes with multiple names
symbol_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
symbol_to_seurat_rowname$seurat_rowname = ensembl_to_seurat_rowname[symbol_to_seurat_rowname[, ensembl]]
symbol_to_seurat_rowname = split(symbol_to_seurat_rowname$seurat_rowname, symbol_to_seurat_rowname[, symbol])
# Ensembl to Entrez
ensembl_to_entrez = unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez[, entrez] = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])
# Seurat-compatible unique rowname to Entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})
# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
annot_ensembl = annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
rownames(annot_ensembl) = annot_ensembl[, ensembl]# Use biomart to translate human cell cycle genes to the species of interest and save them in a file
cc_genes_marker_file = paste0(param$path_out, "/cell_cycle_markers.xlsx")
if (file.exists(cc_genes_marker_file)) {
# Load from file
genes_s = openxlsx::read.xlsx(cc_genes_marker_file, sheet=1)
genes_g2m = openxlsx::read.xlsx(cc_genes_marker_file, sheet=2)
} else {
# Obtain from Ensembl
# Note: both mart objects must point to the same mirror for biomarT::getLDS to work
mart_human = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset="hsapiens_gene_ensembl",
mirror=mirror,
version=param$annot_version))
mart_myspecies = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset=param$mart_dataset,
mirror=GetBiomaRtMirror(mart_human),
version=param$annot_version))
# S phase marker
genes_s = biomaRt::getLDS(attributes=c("ensembl_gene_id","external_gene_name"),
filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$s.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id","external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_s) = c("Human_ensembl_id","Human_gene_name","Species_ensembl_id","Species_gene_name")
# G2/M marker
genes_g2m = biomaRt::getLDS(attributes=c("ensembl_gene_id","external_gene_name"),
filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$g2m.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id","external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_g2m) = c("Human_ensembl_id","Human_gene_name","Species_ensembl_id","Species_gene_name")
# Write to file
openxlsx::write.xlsx(list(S_phase=genes_s,G2M_phase=genes_g2m),file=cc_genes_marker_file)
}
# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))We next read the scRNA-seq dataset(s) into Seurat.
# List of Seurat objects
sc = list()
datasets = param$path_data
for (i in seq(nrow(datasets))) {
name = datasets[i,"name"]
type = datasets[i,"type"]
path = datasets[i,"path"]
# Read 10X or smartseq2
if (type == "10x") {
# Read 10X sparse matrix into a Seurat object
sc[[name]] = ReadSparseMatrix(path,
project=name,
row_name_column=1,
convert_row_names=ensembl_to_seurat_rowname)
} else if (type == "smartseq2") {
# Read counts table into a Seurat object
sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE))
}
}
# Make cell names unique
sc = purrr::map(list_indices(sc), function(i){
cell_names = gsub("-\\d+","",colnames(sc[[i]]))
Seurat::RenameCells(sc[[i]], new.names=paste(cell_names,i,sep="-"))
})
# Set up colors for samples
sample_names = purrr::flatten_chr(purrr::map(sc, function(s){ unique(as.character(s[[]][["orig.ident"]])) }))
param$col_samples = GenerateColours(num_colours=length(sample_names), palette=param$col_palette_samples)
names(param$col_samples) = sample_names
# Sample cells if requested
if (!is.null(param$sample_cells)) {
sc = purrr::map(sc, function(s) {
cells = colnames(s)
return(subset(s, cells=sample(cells, min(param$sample_cells, length(cells)))))
})
}We start the analysis by removing unwanted cells from the dataset(s). Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature_RNA”), the total number of molecules detected in each cell (“nCount_RNA”), and the percentage of counts that map to the mitochrondrial genome (“percent_mt”). If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).
The following first table shows metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), or the above mentioned number of unique genes detected (“nFeature”). The second table shows metadata (columns) of the first 5 genes (rows), for example the number of cells with at least 1 count for the gene (“num_cells_expr”), and the number of cells with at least as many counts as set in the parameter filter section (“num_cells_expr_threshold”).
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$cell_filter)) {
return(param$cell_filter[[s]])
} else {
return(param$cell_filter)
}
})
param$feature_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$feature_filter)) {
return(param$feature_filter[[s]])
} else {
return(param$feature_filter)
}
})# Calculate percentage of counts in mitochondrial genes for each Seurat object
sc = purrr::map(sc, Seurat::PercentageFeatureSet, pattern=param$mt, col.name="percent_mt", assay="RNA")
# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
return(s)
})
# Combine cell metadata of the Seurat objects into one big metadata
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame())
# Print table
knitr::kable(head(sc_cell_metadata), align="l", caption="Cell metadata, top 5 rows") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| orig.ident | nCount_RNA | nFeature_RNA | percent_mt | SampleName | PlateNumber | PlateRow | PlateCol |
|---|---|---|---|---|---|---|---|
| pbmc_10x | 278 | 121 | 26.61870 | NA | NA | NA | NA |
| pbmc_10x | 4730 | 1615 | 10.80338 | NA | NA | NA | NA |
| pbmc_10x | 775 | 256 | 12.51613 | NA | NA | NA | NA |
| pbmc_10x | 7525 | 2558 | 11.41528 | NA | NA | NA | NA |
| pbmc_10x | 4672 | 1560 | 11.77226 | NA | NA | NA | NA |
| pbmc_10x | 230 | 91 | 31.30435 | NA | NA | NA | NA |
# Only RNA assay at the moment
sc = purrr::map(list_names(sc), function(n) {
num_cells_expr = Matrix::rowSums(Seurat::GetAssayData(sc[[n]], slot="counts", assay="RNA") >= 1)
num_cells_expr_threshold = Matrix::rowSums(Seurat::GetAssayData(sc[[n]], slot="counts", assay="RNA") >= param$feature_filter[[n]][["min_counts"]])
sc[[n]][["RNA"]] = Seurat::AddMetaData(sc[[n]][["RNA"]], data.frame(num_cells_expr,num_cells_expr_threshold))
return(sc[[n]])
})
# Print table
knitr::kable(head(sc[[1]][["RNA"]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| feature_id | feature_name | feature_type | num_cells_expr | num_cells_expr_threshold | |
|---|---|---|---|---|---|
| TSPAN6 | ENSG00000000003 | TSPAN6 | Gene Expression | 21 | 21 |
| TNMD | ENSG00000000005 | TNMD | Gene Expression | 0 | 0 |
| DPM1 | ENSG00000000419 | DPM1 | Gene Expression | 70 | 70 |
| SCYL3 | ENSG00000000457 | SCYL3 | Gene Expression | 25 | 25 |
| C1orf112 | ENSG00000000460 | C1orf112 | Gene Expression | 6 | 6 |
# Plot QC metrics for cells
cell_qc_features = c("nFeature_RNA", "nCount_RNA", "percent_mt")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features,"percent_ercc")
cell_qc_features = values_to_names(cell_qc_features)
p_list = list()
for (i in names(cell_qc_features)) {
p_list[[i]]= ggplot(sc_cell_metadata,aes_string(x="orig.ident", y=i, fill="orig.ident")) +
geom_violin(scale="width") +
AddStyle(title=i, legend_position="none", fill=param$col_samples, xlab="") +
theme(axis.text.x = element_text(angle=45, hjust=1))
# Creates a table with min/max values for filter i for each dataset
cell_filter_for_plot = purrr::map_dfr(names(param$cell_filter), function(n) {
# If filter i in cell filter of the dataset, then create dataframe with columns orig.ident, threshold and value
if (i %in% names(param$cell_filter[[n]])){
data.frame(orig.ident=n, threshold=c("min", "max"), value=param$cell_filter[[n]][[i]], stringsAsFactors=FALSE)
}
})
# Add filters as segments to plot
if (nrow(cell_filter_for_plot)>0) {
# Remove entries that are NA
cell_filter_for_plot = cell_filter_for_plot %>% dplyr::filter(!is.na(value))
p_list[[i]] = p_list[[i]] + geom_segment(data=cell_filter_for_plot,
aes(x=as.integer(as.factor(orig.ident))-0.5,
xend=as.integer(as.factor(orig.ident))+0.5,
y=value, yend=value),
lty=2, col="firebrick")
}
}
p = patchwork::wrap_plots(p_list, ncol = 3) + patchwork::plot_annotation("Distribution of feature values")
p# Correlate QC metrics for cells
p_list = list()
p_list[[1]] = ggplot(sc_cell_metadata,aes_string(x=cell_qc_features[2], y=cell_qc_features[1], colour="orig.ident")) +
geom_point() +
AddStyle(col=param$col_samples)
p_list[[2]] = ggplot(sc_cell_metadata,aes_string(x=cell_qc_features[3], y=cell_qc_features[1], colour="orig.ident")) +
geom_point() +
AddStyle(col=param$col_samples)
p = patchwork::wrap_plots(p_list, ncol = 2) + patchwork::plot_annotation("Features plotted against each other")
if (length(sc)==1) {
p = p & theme(legend.position="bottom")
} else {
p = p + patchwork::plot_layout(guides = "collect") & theme(legend.position="bottom")
}
pThe size of the Seurat object before filtering is:
## $pbmc_10x
## An object of class Seurat
## 33694 features across 500 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
##
## $pbmc_smartseq2.sample1
## An object of class Seurat
## 33694 features across 311 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
Cells and genes are filtered based on the following thresholds:
cell_filter_list = param$cell_filter %>% unlist(recursive=FALSE)
cell_filter_tbl = cell_filter_list %>% purrr::reduce(rbind) %>% as.data.frame()
rownames(cell_filter_tbl) = names(cell_filter_list)
colnames(cell_filter_tbl) = c("Min", "Max")
feature_filter_list = param$feature_filter %>% unlist(recursive=FALSE)
feature_filter_tbl = feature_filter_list %>% purrr::reduce(rbind) %>% as.data.frame()
rownames(feature_filter_tbl) = names(feature_filter_list)
colnames(feature_filter_tbl) = "n"
knitr::kable(cell_filter_tbl, align="l", caption="Filters applied to cells") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Min | Max | |
|---|---|---|
| pbmc_10x.nFeature_RNA | 200 | NA |
| pbmc_10x.percent_mt | NA | 20 |
| pbmc_smartseq2.sample1.nFeature_RNA | 200 | NA |
| pbmc_smartseq2.sample1.percent_mt | NA | 20 |
knitr::kable(feature_filter_tbl, align="l", caption="Filters applied to genes") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| n | |
|---|---|
| pbmc_10x.min_counts | 1 |
| pbmc_10x.min_cells | 3 |
| pbmc_smartseq2.sample1.min_counts | 1 |
| pbmc_smartseq2.sample1.min_cells | 3 |
The number of excluded cells and features is as follows:
# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
sc_cells_to_exclude = purrr::map(list_names(sc), function(n) {
filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
filter = param$cell_filter[[n]][[f]]
if (is.numeric(filter)) {
if (is.na(filter[1])) filter[1] = -Inf # Minimum
if (is.na(filter[2])) filter[2] = Inf # Maximum
idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
return(names(which(idx_exclude)))
} else if (is.character(filter)) {
return(names(which(sc[[n]][[f, drop=TRUE]] %in% filter)))
}
})
# Samples to drop
cells_orig_sample = sc[[n]][["orig.ident", drop=TRUE]]
filter_result[["sample_to_drop"]] = names(cells_orig_sample[cells_orig_sample %in% param$samples_to_drop])
return(filter_result)
})
# Summarise
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
return(as.data.frame(purrr::map(s,length)))
})
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)
knitr::kable(sc_cells_to_exclude_summary, align="l", caption="Number of excluded cells") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) | nFeature_RNA | percent_mt | sample_to_drop | |
|---|---|---|---|
| pbmc_10x | 40 | 44 | 0 |
| pbmc_smartseq2.sample1 | 1 | 0 | 0 |
# Now filter, drop the respective colours and adjust integration method
sc = purrr::map(list_names(sc), function(n) {
cells_to_keep = Cells(sc[[n]])
cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
if (length(cells_to_keep)==0) return(NULL)
else return(subset(sc[[n]], cells=cells_to_keep))
}) %>% purrr::discard(is.null)
if (length(sc)==1) param$integrate_samples[["method"]] = "single"# Only RNA assay at the moment
# Iterate over datasets and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
else return(names(which(sc[[n]][["RNA"]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})
# Summarise
sc_features_to_exclude_summary = purrr::map(sc_features_to_exclude, length) %>%
t() %>% as.data.frame()
rownames(sc_features_to_exclude_summary) = c("Genes")
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Number of excluded genes") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| pbmc_10x | pbmc_smartseq2.sample1 | |
|---|---|---|
| Genes | 19387 | 15754 |
# Now filter
sc = purrr::map(list_names(sc), function(n) {
assay_names = Seurat::Assays(sc[[n]])
features_to_keep = purrr::map(values_to_names(assay_names), function(a) {
features = rownames(sc[[n]][[a]])
keep = features[!features %in% sc_features_to_exclude[[n]]]
return(keep)
})
return(subset(sc[[n]], features=purrr::flatten_chr(features_to_keep)))
})After filtering, the size of the Seurat object is:
## $pbmc_10x
## An object of class Seurat
## 14307 features across 441 samples within 1 assay
## Active assay: RNA (14307 features, 0 variable features)
##
## $pbmc_smartseq2.sample1
## An object of class Seurat
## 17940 features across 310 samples within 1 assay
## Active assay: RNA (17940 features, 0 variable features)
In this section, we subsequently run a series of Seurat functions:
1. We start by running a standard log normalisation, where counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
2. Variable genes are selected. For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly expressed in others.
3. To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score). This way, genes are equally weighted for downstream analysis.
In addition, we run:
4. SCTransform, a new and more sophisticated normalisation method that replaces the previous functions (normalisation, variable genes and scaling). Note that although results are kept for steps 1-3 (“RNA” assay), downstream analyses will default to resulting data of step 4 (“SCT” assay).
Finally, cell cycle scores are assigned to each cell based on its normalised expression of G2/M and S phase markers, after basic normalisation (step 1). These scores are visualised in a separate section further below. If specified in the above parameter section, cell cycle effects are removed during scaling (step 4). Note that removing all signal associated to cell cycle can negatively impact downstream analysis. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. As an alternative, we can remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences amongst proliferating cells will be removed. For a more detailed explanation, see the cell cycle vignette for Seurat (“Satija Lab” 2020). Cell cycle effect removed for this report: FALSE; all cell cycle effects removed for this report: FALSE.
While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data include variable genes only, potentially without cell cycle effects, and are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.
# Normalise data the original way
sc = purrr::map(sc, Seurat::NormalizeData, normalization.method = "LogNormalize", scale.factor=10000, verbose=FALSE)
# Select features from normalised data (unaffected by scaling)
sc = purrr::map(sc, Seurat::FindVariableFeatures, selection.method = "vst", nfeatures = 3000, verbose=FALSE)# Determine cell cycle effect
sc = purrr::map(list_names(sc), function(n) {
genes_s_exists = genes_s[, 2] %in% rownames(sc[[n]])
genes_g2m_exists = genes_g2m[, 2] %in% rownames(sc[[n]])
if (sum(genes_s_exists) >= 10 & sum(genes_s_exists) >= 10){
sc[[n]] = Seurat::CellCycleScoring(sc[[n]],
s.features=genes_s[genes_s_exists, 2],
g2m.features=genes_g2m[genes_g2m_exists,2],
set.ident=FALSE, verbose=FALSE)
sc[[n]][["CC.Difference"]] = sc[[n]][["S.Score", drop=TRUE]] - sc[[n]][["G2M.Score", drop=TRUE]]
sc[[n]][["Phase"]] = factor(sc[[n]][["Phase", drop=TRUE]], levels=c("G1","G2M","S"))
} else {
sc[[n]][["S.Score"]] = NA
sc[[n]][["G2M.Score"]] = NA
sc[n][["Phase"]] = factor(NA, levels=c("G1","G2M","S"))
param$cc_remove = FALSE
Warning(paste0("There are not enough G2/M and S phase markers in the dataset '", n, "' to reliably determine cell cycle scores and phases. Scores and phases will be set to NA and removal of cell cycle will not be done."))
}
return(sc[[n]])
})
# If cell cycle effects should be removed, we first score cells
# The effect is then removed in the following chunk
if (param$cc_remove) {
# Add to vars that need to regressed out during normalisation
if (param$cc_remove_all) {
# Remove all signal associated to cell cycle
param$vars_to_regress = unique(c(param$vars_to_regress, "S.Score", "G2M.Score"))
param$latent_vars = unique(c(param$latent_vars, "S.Score", "G2M.Score"))
} else {
# Don't remove the difference between cycling and non-cycling cells
param$vars_to_regress = unique(c(param$vars_to_regress, "CC.Difference"))
param$latent_vars = unique(c(param$latent_vars, "CC.Difference"))
}
}# Scale RNA assay
# Note: Removing cell cycle effects in "RNA" scaled data can be very slow
sc = purrr::map(sc, function(s) {
Seurat::ScaleData(s, features=VariableFeatures(s), vars.to.regress=param$vars_to_regress, verbose=FALSE)
})
# Run SCTransform
#
# This is a new normalisation method that replaces previous Seurat functions 'NormalizeData', 'FindVariableFeatures', and 'ScaleData'.
# vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
# paper: https://www.biorxiv.org/content/10.1101/576827v2
# Normalised data end up here: sc@assays$SCT@data
sc = purrr::map(list_names(sc), function(n) { SCTransform(sc[[n]], vars.to.regress=param$vars_to_regress, min_cells=param$feature_filter[[n]][["min_cells"]], verbose=FALSE) })
# Note: It is not guaranteed that all genes are successfully normalised with SCTransform. Consequently, some genes might be missing from the SCT assay. See: https://github.com/ChristophH/sctransform/issues/27Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells.
To determine variable genes, we need to separate biological variability from technical variability. Technical variability arises especially for lowly expressed genes, where high variability corresponds to small absolute changes that we are not interested in. Here, we use the variance-stabilizing transformation (vst) method implemented in Seurat (Hafemeister and Satija (2019). This method first models the technical variability as a relationship between mean gene expression and variance using local polynomial regression. The model is then used to calculate the expected variance based on the observed mean gene expression. The difference between the observed and expected variance is called residual variance and likely reflects biological variability. The top 3,000 variable genes are used for further analysis.
# Show variable genes
p_list = purrr::map(list_names(sc), function(n) {
top10 = head(Seurat::VariableFeatures(sc[[n]], assay="RNA"), 10)
p = Seurat::VariableFeaturePlot(sc[[n]], assay="RNA", selection.method="vst", col=c("grey", param$col)) +
AddStyle(title=n) +
theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
p = LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
return(p)
})
p = patchwork::wrap_plots(p_list) + patchwork::plot_annotation("Variable genes")
p# Show variable genes
p_list = purrr::map(list_names(sc), function(n) {
top10 = head(Seurat::VariableFeatures(sc[[n]], assay="SCT"), 10)
p = Seurat::VariableFeaturePlot(sc[[n]], assay="SCT", selection.method="sct", col=c("grey", param$col)) +
AddStyle(title=n) +
theme(legend.position=c(0.2, 0.8),legend.background=element_rect(fill=alpha("white", 0.0)))
p = LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
return(p)
})
p = patchwork::wrap_plots(p_list) + patchwork::plot_annotation("Variable genes")
pn_cells_rle_plot = 100
# Sample at most 100 cells per dataset and save their identity; also get list of features shared among datasets
cells_RLE_subset = purrr::map(sc, function(s) {
cells = Seurat::Cells(s)
cell_subset = sample(cells, min(n_cells_rle_plot, length(cells)))
cell_subset = sort(setNames(as.character(s[[]][cell_subset, "orig.ident", drop=TRUE]),cell_subset))
return(cell_subset)
})
cells_orig_sample = purrr::flatten_chr(cells_RLE_subset)
cells_RLE_subset = purrr::map(cells_RLE_subset, names)
shared_rna_features = purrr::reduce(purrr::map(sc, function(s) rownames(s[["RNA"]])), intersect)
shared_sct_features = purrr::reduce(purrr::map(sc, function(s) rownames(s[["SCT"]])), intersect)To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in at most 100 randomly selected cells per sample before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.
For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#374E55FF, #DF8F44FF).
# Get counts from assay RNA
# Note: The counts are retrieved as sparse matrices and then cbind is used to combine them. Only at the plotRLE,
cell_subset_raw = purrr::map(seq(length(sc)), function(i) {
d = GetAssayData(subset(sc[[i]], cells=cells_RLE_subset[[i]], features=shared_rna_features), assay="RNA", slot="counts")
return(d)
})
cell_subset_raw = purrr::reduce(cell_subset_raw, cbind)
# Plot
p = PlotRLE(as.matrix(log2(cell_subset_raw + 1)), id=cells_orig_sample, col=param$col_samples) + labs(title="log2(raw counts + 1)")
p# Get normalised data from assay RNA
# Note: The counts are retrieved as sparse matrices and then cbind is used to combine them
cell_subset_norm1 = purrr::map(seq(length(sc)), function(i) {
d = GetAssayData(subset(sc[[i]], cells=cells_RLE_subset[[i]], features=shared_rna_features), assay="RNA", slot="data")
return(d)
})
cell_subset_norm1 = purrr::reduce(cell_subset_norm1, cbind)
# Plot
p = PlotRLE(as.matrix(cell_subset_norm1), id=cells_orig_sample, col=param$col_samples) + labs(title="Normalised data")
p # Get normalised data from assay SCT
# Note: The counts are retrieved as sparse matrices and then cbind is used to combine them
cell_subset_norm2 = purrr::map(seq(length(sc)), function(i) {
d = GetAssayData(subset(sc[[i]], cells=cells_RLE_subset[[i]], features=shared_sct_features), assay="SCT", slot="data")
return(d)
})
cell_subset_norm2 = purrr::reduce(cell_subset_norm2, cbind)
# Plot
p = PlotRLE(as.matrix(cell_subset_norm2), id=cells_orig_sample, col=param$col_samples) + labs(title="SCTransform'ed data")
p if (param$integrate_samples[["method"]]!="single") {
# Markdown text for this section (do not change intendation)
cat("## Integration of multiple datasets\n\n")
# Feature metadata is removed by Seurat merge entirely; save separately for each assay and add again afterwards
assay_names = unique(purrr::flatten_chr(purrr::map(list_names(sc), function(n) { Seurat::Assays(sc[[n]]) } )))
# Loop through all assays and accumulate meta data
feature_data_for_assay = purrr::map(values_to_names(assay_names), function(a) {
# "feature_id", "feature_name", "feature_type" are accumulated for all assays and stored just once
# This step is skipped for assays that do not contain all three types of feature information
contains_neccessary_columns = purrr::map_lgl(list_names(sc), function(n) { all(c("feature_id","feature_name","feature_type") %in% colnames(sc[[n]][[a]][[]])) })
if (all(contains_neccessary_columns)) {
feature_id_name_type = purrr::map(sc, function(s) { return(s[[a]][[c("feature_id", "feature_name", "feature_type")]]) })
feature_id_name_type = purrr::reduce(feature_id_name_type, function(df_x, df_y) {
new_rows = which(!rownames(df_y) %in% rownames(df_x))
if (length(new_rows)>0) return(rbind(df_x, df_y[new_rows,]))
else return(df_x)
})
feature_id_name_type$row_names = rownames(feature_id_name_type)
} else {
feature_id_name_type = NULL
}
# For all other metadata, we prefix column names with the dataset
other_feature_data = purrr::map(list_names(sc), function(n) {
df = sc[[n]][[a]][[]]
if (contains_neccessary_columns[[n]]) df = df %>% dplyr::select(-dplyr::one_of(c("feature_id","feature_name","feature_type"),c()))
if (ncol(df)>0) colnames(df) = paste(n, colnames(df), sep=".")
df$row_names = rownames(df)
return(df)
})
# Now join everything by row_names by full outer join
if (!is.null(feature_id_name_type)) {
feature_data = purrr::reduce(c(list(feature_id_name_type=feature_id_name_type),other_feature_data), dplyr::full_join, by="row_names")
} else {
feature_data = purrr::reduce(other_feature_data, dplyr::full_join, by="row_names")
}
rownames(feature_data) = feature_data$row_names
feature_data$row_names = NULL
return(feature_data)
})
}# Standard method for integrating multiple samples. Best performance but computationally intensive.
if (param$integrate_samples[["method"]]=="standard") {
# Note "Assay names should only have numbers and letters: Warnung: Keys should be one or more alphanumeric characters followed by an underscore, setting key from rna_integrated_ to rnaintegrated_" (seurat/R/object.R)
# The integration step will temporarily occupy lots of memory. However, R has problems with freeing unused memory.
# By wrapping the steps into a function, hopefully this works a bit better.
run_standard_integration = function(sc_objs, ndims=30, vars_to_regress=c(), verbose=FALSE) {
# Find integration anchors for assay RNA
sc_objs = purrr::map(sc_objs, function(s){ Seurat::DefaultAssay(s) = "RNA"; return(s)})
integrate_RNA_anchors = Seurat::FindIntegrationAnchors(object.list=sc_objs, dims=1:ndims, verbose=verbose)
tmp_sc_objs = Seurat::IntegrateData(integrate_RNA_anchors, new.assay.name="RNAintegrated", dims=1:ndims, verbose=verbose)
# scale data: according to seurat, this is needed only for the integrated RNA assay
tmp_sc_objs = Seurat::ScaleData(tmp_sc_objs, features=rownames(tmp_sc_objs[["RNAintegrated"]]), verbose=verbose, vars.to.regress=vars_to_regress, assay="RNAintegrated")
sc_obj_RNAintegrated_assay = Seurat::GetAssay(tmp_sc_objs, assay="RNAintegrated")
rm(tmp_sc_objs, integrate_RNA_anchors)
# Find integration anchors for assay SCT
sc_objs = purrr::map(sc_objs, function(s){ Seurat::DefaultAssay(s) = "SCT"; return(s)})
integrate_SCT_features = SelectIntegrationFeatures(object.list=sc_objs, verbose=verbose)
sc_objs = PrepSCTIntegration(object.list=sc_objs, anchor.features=integrate_SCT_features, verbose=verbose, assay=rep("SCT",length(sc_objs)))
integrate_SCT_anchors = FindIntegrationAnchors(object.list=sc_objs, normalization.method="SCT", anchor.features=integrate_SCT_features, verbose=verbose)
sc_objs = Seurat::IntegrateData(integrate_SCT_anchors, new.assay.name="SCTintegrated", normalization.method="SCT", dims=1:ndims, verbose=verbose)
sc_objs[["RNAintegrated"]] = sc_obj_RNAintegrated_assay
rm(integrate_SCT_anchors, integrate_SCT_features, sc_obj_RNAintegrated_assay)
# Call garbage collector to free memory (hope it helps)
gc(verbose=verbose)
return(sc_objs)
}
# call function
sc = run_standard_integration(sc, ndims=param$integrate_samples[["dimensions"]], vars_to_regress=param$vars_to_regress)
# Scale RNA assay
# Note: Removing cell cycle effects in "RNA" scaled data can be very slow
DefaultAssay(sc) = "RNA"
sc = Seurat::ScaleData(sc, features=rownames(sc[["RNA"]]), vars.to.regress=param$vars_to_regress, verbose=FALSE, assay="RNA")
# Add feature metadata
for (a in Seurat::Assays(sc)) {
if (a %in% names(feature_data_for_assay)) {
sc[[a]] = Seurat::AddMetaData(sc[[a]], feature_data_for_assay[[a]][rownames(sc[[a]]),, drop=FALSE])
}
}
# Set default assay (will be the integrated version)
DefaultAssay(sc) = paste0(param$normalisation_default,"integrated")
}if (param$integrate_samples[["method"]]!="single" & param$integrate_samples[["method"]]!="merge") {
# Markdown text for this section (do not change intendation)
cat("\n\n### Relative log expression after integration {.tabset}\n\n")
cat("#### Standard normalisation (RNA)\n\n")
# Get normalised data from assay RNAintegrated and plot RLE
cell_subset_norm1 = GetAssayData(subset(sc, cells=purrr::flatten_chr(cells_RLE_subset), features=shared_rna_features), assay="RNAintegrated", slot="data")
p = PlotRLE(as.matrix(cell_subset_norm1), id=cells_orig_sample, col=param$col_samples) + labs(title="Normalised integrated data")
print(p)
}if (param$integrate_samples[["method"]]!="single" & param$integrate_samples[["method"]]!="merge") {
# Markdown text for this section (do not change intendation)
cat("\n\n#### Advanced normalisation (SCT)\n\n")
# Get normalised data from assay SCTintegrated and plot RLE
cell_subset_norm2 = GetAssayData(subset(sc, cells=purrr::flatten_chr(cells_RLE_subset), features=shared_sct_features), assay="SCTintegrated", slot="data")
p = PlotRLE(as.matrix(cell_subset_norm2), id=cells_orig_sample, col=param$col_samples) + labs(title="SCTransform'ed integrated data")
print(p)
}A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarise a dataset, and to visualise a dataset.
We use Principal Component Analysis (PCA) to summarise a dataset, overcoming noise and reducing the data to its essential components. Each principal component (PC) represents a “metafeature” that combines information across a correlated gene set. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualise the dataset, placing similar cells together in 2D space, see below.
To decide how many PCs to include in downstream analyses, we visualize cells and genes that define the PCA.
# Run PCA for default normalisation
sc = Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE)
p_list = Seurat::VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
p = patchwork::wrap_plots(p_list, ncol = 2) + patchwork::plot_annotation("Top gene loadings of the first two PCs")
pp = Seurat::DimPlot(sc, reduction="pca", cols=param$col_samples) +
AddStyle(title="Cells arranged by the first two PCs", legend_position="bottom")
pWe next need to decide how many PCs we want to use for our analyses. The following “Elbow plot” is designed to help us make an informed decision. PCs are ranked based on the percentage of variance they explain.
For the current dataset, 10 PCs were chosen.
# More approximate technique used to reduce computation time
p = Seurat::ElbowPlot(sc, ndims=20) + AddStyle(title="Elbow plot")
pSeurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm.
# Note: I changed the seed in ./lib/python3.6/site-packages/leidenalg/functions.py to 11 for reproducibility
# The number of clusters can be optimized by tuning 'resolution' -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n, verbose=FALSE)
# Cluster using the Leiden algorithm
# Paper to Leiden algorithm: https://www.nature.com/articles/s41598-019-41695-z
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
# Taken from documentation:
# Method for running leiden (defaults to matrix which is fast for small datasets). Enable method = "igraph" to avoid casting large data to a dense matrix.
# Note Katrin:
# For method="igraph", we need to set weights=TRUE
# This is implemented as default recently and will be updated with a new Seurat release
# https://github.com/satijalab/seurat/pull/3233
sc = Seurat::FindClusters(sc, resolution=param$cluster_resolution, algorithm=4, verbose=FALSE, method="igraph", weights=TRUE)
# Construct phylogenetic tree relating the 'average' cell from each cluster
sc = BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE)
# Set up colors for clusters
cluster_names = levels(sc$seurat_clusters)
param$col_clusters = GenerateColours(num_colours=length(cluster_names), palette=param$col_palette_clusters)
names(param$col_clusters) = cluster_namesWe use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see (“Understanding Umap” 2019).
# Default UMAP
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot"))
# 3D UMAP
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, n.components=3, reduction.name="umap3d", reduction.key="UMAP3D_", verbose=FALSE, umap.method="uwot"))
# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p = Seurat::DimPlot(sc, reduction="umap", label=TRUE) +
AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters") +
scale_color_manual(values=param$col_clusters, labels=cluster_labels)
p# Add a UMAP that is coloured by sample of origin
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% unique() %>% sort()
if (length(cell_samples) > 1) {
# Note: This is a hack to colour by sample but label by Cluster
p = Seurat::DimPlot(sc, group.by="orig.ident", pt.size=1, cols=param$col_samples) +
AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data),]
p = LabelClusters(p, id="seurat_clusters")
print(p)
}The following table shows the number of cells per sample per cluster:
In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher than expected:
# Count cells per cluster per sample
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% unique() %>% sort()
tbl = lapply(cell_samples, function(i) {
tmp = sc[[]] %>% dplyr::filter(orig.ident==i) %>% dplyr::count(seurat_clusters) %>%
dplyr::mutate(perc=round(n/sum(n)*100, 2))
colnames(tmp)[2:ncol(tmp)] = paste0(i, ".", colnames(tmp)[2:ncol(tmp)])
return(tmp)
}) %>%
purrr::reduce(dplyr::full_join, by="seurat_clusters")
tbl$seurat_clusters = paste0("Cluster_", tbl$seurat_clusters)
tbl = tbl %>% tibble::column_to_rownames(var="seurat_clusters")
# Add enrichment
if (length(cell_samples) > 1) {
# Add enrichment
tbl = cbind(tbl, cells_fisher(sc))
# Sort columns
tbl_o = lapply(cell_samples, function(s) paste0(s, ".", c("n", "perc", "oddsRatio", "p"))) %>% unlist()
tbl = tbl[, tbl_o]
}
# Print
knitr::kable(tbl, align="l", caption="Number of cells per cluster per sample") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| pbmc_10x.n | pbmc_10x.perc | pbmc_10x.oddsRatio | pbmc_10x.p | pbmc_smartseq2.sample1.n | pbmc_smartseq2.sample1.perc | pbmc_smartseq2.sample1.oddsRatio | pbmc_smartseq2.sample1.p | |
|---|---|---|---|---|---|---|---|---|
| Cluster_1 | 99 | 22.45 | 1.05 | 4.3e-01 | 67 | 21.61 | 0.95 | 6.4e-01 |
| Cluster_2 | 74 | 16.78 | 0.84 | 8.4e-01 | 60 | 19.35 | 1.19 | 2.1e-01 |
| Cluster_3 | 60 | 13.61 | 0.71 | 9.6e-01 | 56 | 18.06 | 1.4 | 6.0e-02 |
| Cluster_4 | 83 | 18.82 | 2.24 | 1.9e-04 | 29 | 9.35 | 0.45 | 1.0e+00 |
| Cluster_5 | 51 | 11.56 | 1.49 | 7.3e-02 | 25 | 8.06 | 0.67 | 9.6e-01 |
| Cluster_6 | 34 | 7.71 | 0.62 | 9.8e-01 | 37 | 11.94 | 1.62 | 3.5e-02 |
| Cluster_7 | 35 | 7.94 | 1.25 | 2.7e-01 | 20 | 6.45 | 0.8 | 8.2e-01 |
| Cluster_8 | 5 | 1.13 | 0.21 | 1.0e+00 | 16 | 5.16 | 4.74 | 1.1e-03 |
How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? After initial normalisation, we determined the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt. This section of the report visualises the above calculated cell cycle scores.
# Get a feeling for how many cells are affected
p1 = ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) +
geom_point() +
scale_x_continuous("G1/S score") +
scale_y_continuous("G2/M score") +
AddStyle()
p2 = ggplot(sc@meta.data %>%
dplyr::group_by(seurat_clusters,Phase) %>%
dplyr::summarise(num_reads=length(Phase)),
aes(x=seurat_clusters, y=num_reads, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_x_discrete("Seurat clusters") +
scale_y_continuous("Fraction of cells") +
AddStyle()
p = p1 + p2 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation(title="Cell cycle phases")
p# UMAP with phases superimposed
# Note: This is a hack to colour by phase but label by Cluster
p = Seurat::DimPlot(sc, group.by="Phase", pt.size=1) +
AddStyle(title="UMAP, cells coloured by cell cycle phases", legend_title="Phase")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data),]
p = LabelClusters(p, id = "seurat_clusters")
pp = Seurat::FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", label=TRUE, cols=c("lightgrey", param$col)) +
AddStyle(title="UMAP, cells coloured by S phase")
pp = Seurat::FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col), label=TRUE) +
AddStyle(title="UMAP, cells coloured by G2M phase")
pp = Seurat::FeaturePlot(sc, features="CC.Difference", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col), label=TRUE) +
AddStyle(title="UMAP, cells coloured by CC.Difference")
pDo cells in individual clusters have particularly high counts, detected genes or mitochondrial content?
p = Seurat::FeaturePlot(sc, features="nCount_RNA", cols=c("lightgrey", param$col), label=TRUE) +
AddStyle(title="nCount_RNA")
pDo cells in individual clusters express provided known marker genes?
if (!is.null(param$file_known_markers)) {
# Read known marker genes and map to rownames
known_markers = openxlsx::read.xlsx(param$file_known_markers)
known_markers_list = lapply(colnames(known_markers), function(x) {
y = ensembl_to_seurat_rowname[known_markers[,x]] %>%
na.exclude() %>% unique() %>% sort()
m = !y %in% rownames(sc)
if (any(m)){
Warning(paste0("The following genes of marker list '", x, "' cannot be found in the data: ", first_n_elements_to_string(y[m], n=10)))
}
return(y[!m])
})
names(known_markers_list) = colnames(known_markers)
is_empty = lapply(known_markers_list,length)==0
known_markers_list = known_markers_list[!is_empty]
# Set plot options
known_markers_vect = unlist(known_markers_list) %>% unique() %>% sort()
idx_dotplot = sapply(1:length(known_markers_list), function(x) length(known_markers_list[[x]]) <=50)
idx_avgplot = sapply(1:length(known_markers_list), function(x) length(known_markers_list[[x]]) >= 10)
known_markers_n = length(known_markers_list)
} else {
known_markers_n = 0
idx_dotplot = idx_avgplot = FALSE
known_markers_vect = known_markers_list = c()
}# Dotplots and average feature plots
# The height of 1 row (= 1 plot) is fixed to 5
fig_height_knownMarkers_dotplot = max(5, 5 * sum(idx_dotplot))
fig_height_knownMarkers_avgplot = max(5, 5 * sum(idx_avgplot))
# Individual feature plots
# Each row contains 2 plots
# We fix the height of each plot to the same height as is used later for DEGs
cluster_all = levels(Idents(sc))
height_per_row = max(2, 0.3 * length(cluster_all))
nr_rows = ceiling(length(known_markers_vect)/2)
fig_height_knownMarkers_vect = max(5, height_per_row * nr_rows)You provided 7 list(s) of known marker genes. In the following tabs, you find:
A dot plot visualizes how gene expression changes across different clusters. The size of a dot encodes the percentage of cells in a cluster that express the gene, while the color encodes the average expression across all cells within the cluster.
if ((known_markers_n > 0) & any(idx_dotplot)) {
known_markers_dotplot = known_markers_list[idx_dotplot]
p_list = list()
for (i in seq(known_markers_dotplot)) {
g = known_markers_dotplot[[i]]
g = g[length(g):1]
p_list[[i]] = Seurat::DotPlot(sc, features=g, cols=c("lightgrey", param$col)) +
AddStyle(title=paste("Known marker genes:", names(known_markers_dotplot)[i]),
ylab="Cluster") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))
}
p = patchwork::wrap_plots(p_list, ncol=1)
print(p)
} else if ((known_markers_n > 0) & !any(idx_dotplot)) {
message("This tab is used for dot plots for up to 50 genes. All provided lists are longer than this, and hence dot plots are skipped.")
} else {
message("No known marker genes were provided and hence dot plots are skipped.")
}An average feature plot visualizes the average gene expression of each gene list on a single-cell level, subtracted by the aggregated expression of control feature sets. The color of the plot encodes the calculated scores, whereat positive scores suggest that genes are expressed more highly than expected.
if ((known_markers_n > 0) & any(idx_avgplot)) {
known_markers_avgplot = known_markers_list[idx_avgplot]
sc = Seurat::AddModuleScore(sc, features=known_markers_avgplot, assay="SCT", ctrl=10, name="known_markers")
idx_replace_names = grep("^known_markers[0-9]+$", colnames(sc@meta.data), perl=TRUE)
colnames(sc@meta.data)[idx_replace_names] = names(known_markers_avgplot)
p_list = Seurat::FeaturePlot(sc, features=names(known_markers_avgplot), cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in seq(known_markers_avgplot)) {
p_list[[i]] = p_list[[i]] + AddStyle(title=paste("Known marker genes:", names(known_markers_avgplot)[i]))
}
p = patchwork::wrap_plots(p_list, ncol=1)
print(p)
} else if ((known_markers_n > 0) & !any(idx_avgplot)) {
message("This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.")
} else {
message("No known marker genes were provided and hence average feature plots are skipped.")
}× (Message) This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.
An individual feature plot colours single cells on the UMAP according to their normalised gene expression.
if ((known_markers_n > 0) & length(known_markers_vect) <= 30) {
p_list = Seurat::FeaturePlot(sc, features=known_markers_vect, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
p = patchwork::wrap_plots(p_list, ncol=2)
print(p)
} else if (length(known_markers_vect) > 30) {
message("This tab is used to plot up to 30 known marker genes. Your provided list is longer than this, and hence individual feature plots are skipped.")
} else {
message("No known marker genes were provided and hence individual feature plots are skipped.")
}We next identify genes that are differentially expressed in one cluster compared to all other clusters, based on raw “RNA” data and the method “MAST”. Resulting p-values are adjusted using the Bonferroni method. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.
# We load and unload the MAST R package in this chunk, as it overwrites Seurat functions
suppressPackageStartupMessages(library(MAST))
# Find markers for every cluster compared to all remaining cells, report positive and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers
# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
sc_markers = suppressMessages(Seurat::FindAllMarkers(sc, assay="RNA", test.use="MAST",
only.pos=FALSE, min.pct=0.25, logfc.threshold=0.25,
latent.vars=param$latent_vars, verbose=FALSE, silent=TRUE))
# Convert natural log to log2
# https://github.com/satijalab/seurat/issues/3346#issue-672668498
# log_a(b) * log_c(a) = log_c(b)
# log_n(fc) = log_2(fc) * log_n(2)
# log_2(fc) = log_n(fc) / log_n(2)
sc_markers$avg_lognfc = sc_markers$avg_logFC
sc_markers$avg_log2fc = sc_markers$avg_lognfc / log(2)
# Introduce signed p-value score and sort table
sc_markers$p_val_adj_score = -log2(sc_markers$p_val_adj) * sign(sc_markers$avg_log2fc)
sc_markers = sc_markers %>%
dplyr::arrange(cluster, -p_val_adj_score, -avg_log2fc) %>%
as.data.frame()
# Filter markers based on p-value and fold-change
sc_markers_filt = sc_markers %>%
dplyr::filter(p_val_adj <= param$padj) %>%
dplyr::filter((avg_log2fc <= -param$log2fc) | (avg_log2fc >= param$log2fc)) %>%
as.data.frame()
# Separate up- and down-regulated genes
sc_markers_filt_down = sc_markers_filt %>%
dplyr::filter(avg_log2fc <= -param$log2fc) %>%
as.data.frame()
sc_markers_filt_up = sc_markers_filt %>%
dplyr::filter(avg_log2fc >= param$log2fc) %>%
as.data.frame()
# Get top 3 markers
sc_markers_filt_top3 = sc_markers_filt %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=3, wt=p_val_adj_score) %>%
dplyr::select(cluster, gene, avg_log2fc, p_val, p_val_adj, pct.1, pct.2) %>%
as.data.frame()
# Format columns of top 3 markers
sc_markers_filt_top3$p_val = formatC(as.numeric(sc_markers_filt_top3$p_val), format="e", digits=1)
sc_markers_filt_top3$p_val_adj = formatC(as.numeric(sc_markers_filt_top3$p_val_adj), format="e", digits=1)
sc_markers_filt_top3$avg_log2fc = round(sc_markers_filt_top3$avg_log2fc, digits=3)
# Show top 3 markers per cluster
knitr::kable(sc_markers_filt_top3, align="l", caption="Top 3 DEGs per cell cluster") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px")| cluster | gene | avg_log2fc | p_val | p_val_adj | pct.1 | pct.2 |
|---|---|---|---|---|---|---|
| 1 | GZMH | 2.292 | 4.0e-83 | 7.5e-79 | 0.982 | 0.236 |
| 1 | NKG7 | 1.908 | 3.2e-69 | 6.0e-65 | 1.000 | 0.475 |
| 1 | GZMB | 1.378 | 8.3e-66 | 1.6e-61 | 0.910 | 0.195 |
| 2 | GZMK | 2.352 | 3.3e-42 | 6.3e-38 | 0.500 | 0.024 |
| 2 | DUSP2 | 1.690 | 3.6e-32 | 6.7e-28 | 0.843 | 0.374 |
| 2 | CST7 | 0.669 | 3.4e-27 | 6.3e-23 | 0.903 | 0.415 |
| 3 | LYZ | 6.375 | 3.0e-136 | 5.6e-132 | 0.991 | 0.044 |
| 3 | CTSS | 3.894 | 5.1e-124 | 9.5e-120 | 1.000 | 0.392 |
| 3 | SERPINA1.1 | 3.621 | 6.8e-120 | 1.3e-115 | 0.931 | 0.009 |
| 4 | IL7R | 1.570 | 1.2e-44 | 2.3e-40 | 0.955 | 0.285 |
| 4 | LTB | 1.891 | 3.1e-43 | 5.9e-39 | 0.902 | 0.282 |
| 4 | MAL | 1.160 | 3.0e-34 | 5.6e-30 | 0.509 | 0.038 |
| 5 | CD79A | 3.831 | 6.6e-102 | 1.2e-97 | 0.987 | 0.015 |
| 5 | HLA-DQA1 | 2.900 | 2.2e-79 | 4.1e-75 | 0.961 | 0.036 |
| 5 | CD74 | 3.196 | 3.3e-75 | 6.2e-71 | 1.000 | 0.794 |
| 6 | GNLY | 2.811 | 1.7e-43 | 3.2e-39 | 0.972 | 0.297 |
| 6 | KLRF1 | 2.033 | 3.9e-43 | 7.4e-39 | 0.732 | 0.041 |
| 6 | PRF1 | 2.531 | 1.9e-38 | 3.5e-34 | 1.000 | 0.368 |
| 7 | LEF1 | 1.949 | 1.1e-27 | 2.1e-23 | 0.709 | 0.070 |
| 7 | RCAN3 | 1.646 | 1.3e-21 | 2.4e-17 | 0.800 | 0.203 |
| 7 | RPS3A | 0.921 | 8.3e-21 | 1.6e-16 | 1.000 | 0.974 |
| 8 | TAGLN2 | 4.401 | 2.0e-57 | 3.7e-53 | 0.810 | 0.710 |
| 8 | MAX | 4.413 | 3.3e-53 | 6.2e-49 | 0.810 | 0.322 |
| 8 | GPX1 | 5.359 | 1.6e-50 | 3.0e-46 | 0.952 | 0.319 |
# Add Ensembl annotation
sc_markers_ensembl = seurat_rowname_to_ensembl[sc_markers_filt[,"gene"]]
sc_markers_annot = cbind(sc_markers_filt, annot_ensembl[sc_markers_ensembl,])
# Output in Excel sheet
sc_markers_lst = lapply(levels(sc_markers_annot$cluster), function(x) {sc_markers_annot %>% dplyr::filter(cluster==x)})
names(sc_markers_lst) = paste0("cluster", levels(sc_markers_filt$cluster))
openxlsx::write.xlsx(sc_markers_lst, file=paste0(param$path_out, "/markers.xlsx"))
# Number of DEGs per cluster
cluster_all = levels(Idents(sc))
sc_markers_filt_n = cbind(Down=sapply(cluster_all, function(x) sum(sc_markers_filt_down$cluster == x)),
Up=sapply(cluster_all, function(x) sum(sc_markers_filt_up$cluster == x))) %>%
as.data.frame()
sc_markers_filt_n = cbind(Cluster=factor(cluster_all, levels=cluster_all),
sc_markers_filt_n) %>%
tidyr::pivot_longer(cols=c("Down", "Up"),
names_to="Direction",
values_to="n")
p = ggplot(sc_markers_filt_n, aes(x=Cluster, y=n, fill=Direction)) +
geom_bar(stat="identity") +
AddStyle(title=paste0("Number of DEGs per cell cluster\n(FC=", 2^param$log2fc, ", adj. p-value=", param$padj, ")"),
fill=c("steelblue", "darkgoldenrod1"))
pThe following plots are exemplary to how we can visualize differentially expressed genes using the Seurat R-package. The selected genes are the top differentially expressed genes for each cluster, respectively.
# Get top gene per cluster and plot
genes_example_df = sc_markers_filt %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=1, wt=avg_log2fc) %>%
dplyr::select(cluster, gene)
genes_example = genes_example_df$gene
genes_example_titles = paste0(genes_example_df$cluster,": ",genes_example_df$gene)# Each row contains 2 plots
nr_rows = ceiling(length(genes_example)/2)
# The height of each plot might depend on the number of clusters
height_per_row = max(2, 0.3 * length(cluster_all))
# Total height of plots
fig_height_fixed = max(5, 2 * nr_rows)
fig_height_variable = max(5, height_per_row * nr_rows)# Shows gene expression on the UMAP
p_list = Seurat::FeaturePlot(sc, features=genes_example, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(title=genes_example_titles[i])
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data (assay='SCT', slot='data')")
p# Violin plot of raw gene expression counts
p_list = Seurat::VlnPlot(sc, features=genes_example, assay="RNA", slot="counts", combine=FALSE, pt.size=0.2)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(title=genes_example_titles[i],
legend_title="Cell identity",
fill=param$col_clusters,
xlab="Cluster")
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="Violin plot of raw gene expression counts") +
patchwork::plot_layout(guides = "collect") & theme(legend.position="bottom")
suppressMessages(p)# Violin plot of normalised gene expression data
p_list = Seurat::VlnPlot(sc, features=genes_example, combine=FALSE, pt.size=0.2)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(title=genes_example_titles[i],
legend_title="Cell identity",
fill=param$col_clusters,
xlab="Cluster")
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="Violin plot of normalised gene expression data") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
suppressMessages(p)# Ridge plot of raw gene expression counts
# Specify "RNA" assay to look at raw data
p_list = Seurat::RidgePlot(sc, features=genes_example, assay="RNA", slot="counts", combine=FALSE)
for (i in seq(p_list)) {
p_list[[i]] = p_list[[i]] + AddStyle(title=genes_example_titles[i],
legend_title="Cell identity",
fill=param$col_clusters,
ylab="Cluster")
}
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="Ridge plot of raw gene expression counts") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
suppressMessages(p)# Ridge plot of normalised gene expression data
# Defaults to active assay
p_list = Seurat::RidgePlot(sc, features=genes_example, slot="data", combine=FALSE)
for (i in seq(p_list)) {
p_list[[i]] = p_list[[i]] + AddStyle(title=genes_example_titles[i],
legend_title="Cell identity",
fill=param$col_clusters,
ylab="Cluster")
}
p = patchwork::wrap_plots(p_list, ncol=2) +
patchwork::plot_annotation(title="Ridge plot of normalised and log-transformed gene expression data") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
suppressMessages(p)# Visualises how feature expression changes across different clusters
p = Seurat::DotPlot(sc, features=unique(genes_example[length(genes_example):1]), cols=c("lightgrey", param$col)) +
AddStyle(title="Dot plot of normalised gene expression data",
ylab="Cluster") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))
p# Heatmap of top differentially expressed genes
top = sc_markers_filt %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=10, wt=p_val_adj_score)
p = Seurat::DoHeatmap(sc, features=top$gene, group.colors=param$col_clusters) +
NoLegend() +
ggtitle("Heatmap of scaled gene expression data")× (Warning) Warning in Seurat::DoHeatmap(sc, features = top\(gene, group.colors = ## param\)col_clusters): The following features were omitted as they were not found ## in the scale.data slot for the SCT assay: RPL19, RPL13, RPS5, EPHX2, RPS3A, ## TPT1, HLA-C.3
To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.
We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (Chen 2020). You can choose to test functional enrichment from a wide range of databases:
dbs_all = enrichR::listEnrichrDbs()
knitr::kable(dbs_all, align="l", caption="Enrichr databases") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="300px")| geneCoverage | genesPerTerm | libraryName | link | numTerms |
|---|---|---|---|---|
| 13362 | 275 | Genome_Browser_PWMs | http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ | 615 |
| 27884 | 1284 | TRANSFAC_and_JASPAR_PWMs | http://jaspar.genereg.net/html/DOWNLOAD/ | 326 |
| 6002 | 77 | Transcription_Factor_PPIs | 290 | |
| 47172 | 1370 | ChEA_2013 | http://amp.pharm.mssm.edu/lib/cheadownload.jsp | 353 |
| 47107 | 509 | Drug_Perturbations_from_GEO_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 701 |
| 21493 | 3713 | ENCODE_TF_ChIP-seq_2014 | http://genome.ucsc.edu/ENCODE/downloads.html | 498 |
| 1295 | 18 | BioCarta_2013 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 249 |
| 3185 | 73 | Reactome_2013 | http://www.reactome.org/download/index.html | 78 |
| 2854 | 34 | WikiPathways_2013 | http://www.wikipathways.org/index.php/Download_Pathways | 199 |
| 15057 | 300 | Disease_Signatures_from_GEO_up_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 4128 | 48 | KEGG_2013 | http://www.kegg.jp/kegg/download/ | 200 |
| 34061 | 641 | TF-LOF_Expression_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 269 |
| 7504 | 155 | TargetScan_microRNA | http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 | 222 |
| 16399 | 247 | PPI_Hub_Proteins | http://amp.pharm.mssm.edu/X2K | 385 |
| 12753 | 57 | GO_Molecular_Function_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 1136 |
| 23726 | 127 | GeneSigDB | http://genesigdb.org/genesigdb/downloadall.jsp | 2139 |
| 32740 | 85 | Chromosome_Location | http://software.broadinstitute.org/gsea/msigdb/index.jsp | 386 |
| 13373 | 258 | Human_Gene_Atlas | http://biogps.org/downloads/ | 84 |
| 19270 | 388 | Mouse_Gene_Atlas | http://biogps.org/downloads/ | 96 |
| 13236 | 82 | GO_Cellular_Component_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 641 |
| 14264 | 58 | GO_Biological_Process_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 5192 |
| 3096 | 31 | Human_Phenotype_Ontology | http://www.human-phenotype-ontology.org/ | 1779 |
| 22288 | 4368 | Epigenomics_Roadmap_HM_ChIP-seq | http://www.roadmapepigenomics.org/ | 383 |
| 4533 | 37 | KEA_2013 | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 474 |
| 10231 | 158 | NURSA_Human_Endogenous_Complexome | https://www.nursa.org/nursa/index.jsf | 1796 |
| 2741 | 5 | CORUM | http://mips.helmholtz-muenchen.de/genre/proj/corum/ | 1658 |
| 5655 | 342 | SILAC_Phosphoproteomics | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 84 |
| 10406 | 715 | MGI_Mammalian_Phenotype_Level_3 | http://www.informatics.jax.org/ | 71 |
| 10493 | 200 | MGI_Mammalian_Phenotype_Level_4 | http://www.informatics.jax.org/ | 476 |
| 11251 | 100 | Old_CMAP_up | http://www.broadinstitute.org/cmap/ | 6100 |
| 8695 | 100 | Old_CMAP_down | http://www.broadinstitute.org/cmap/ | 6100 |
| 1759 | 25 | OMIM_Disease | http://www.omim.org/downloads | 90 |
| 2178 | 89 | OMIM_Expanded | http://www.omim.org/downloads | 187 |
| 851 | 15 | VirusMINT | http://mint.bio.uniroma2.it/download.html | 85 |
| 10061 | 106 | MSigDB_Computational | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 858 |
| 11250 | 166 | MSigDB_Oncogenic_Signatures | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 189 |
| 15406 | 300 | Disease_Signatures_from_GEO_down_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 17711 | 300 | Virus_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 17576 | 300 | Virus_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 15797 | 176 | Cancer_Cell_Line_Encyclopedia | https://portals.broadinstitute.org/ccle/home | 967 |
| 12232 | 343 | NCI-60_Cancer_Cell_Lines | http://biogps.org/downloads/ | 93 |
| 13572 | 301 | Tissue_Protein_Expression_from_ProteomicsDB | https://www.proteomicsdb.org/ | 207 |
| 6454 | 301 | Tissue_Protein_Expression_from_Human_Proteome_Map | http://www.humanproteomemap.org/index.php | 30 |
| 3723 | 47 | HMDB_Metabolites | http://www.hmdb.ca/downloads | 3906 |
| 7588 | 35 | Pfam_InterPro_Domains | ftp://ftp.ebi.ac.uk/pub/databases/interpro/ | 311 |
| 7682 | 78 | GO_Biological_Process_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 941 |
| 7324 | 172 | GO_Cellular_Component_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 205 |
| 8469 | 122 | GO_Molecular_Function_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 402 |
| 13121 | 305 | Allen_Brain_Atlas_up | http://www.brain-map.org/ | 2192 |
| 26382 | 1811 | ENCODE_TF_ChIP-seq_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 816 |
| 29065 | 2123 | ENCODE_Histone_Modifications_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 412 |
| 280 | 9 | Phosphatase_Substrates_from_DEPOD | http://www.koehn.embl.de/depod/ | 59 |
| 13877 | 304 | Allen_Brain_Atlas_down | http://www.brain-map.org/ | 2192 |
| 15852 | 912 | ENCODE_Histone_Modifications_2013 | http://genome.ucsc.edu/ENCODE/downloads.html | 109 |
| 4320 | 129 | Achilles_fitness_increase | http://www.broadinstitute.org/achilles | 216 |
| 4271 | 128 | Achilles_fitness_decrease | http://www.broadinstitute.org/achilles | 216 |
| 10496 | 201 | MGI_Mammalian_Phenotype_2013 | http://www.informatics.jax.org/ | 476 |
| 1678 | 21 | BioCarta_2015 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 239 |
| 756 | 12 | HumanCyc_2015 | http://humancyc.org/ | 125 |
| 3800 | 48 | KEGG_2015 | http://www.kegg.jp/kegg/download/ | 179 |
| 2541 | 39 | NCI-Nature_2015 | http://pid.nci.nih.gov/ | 209 |
| 1918 | 39 | Panther_2015 | http://www.pantherdb.org/ | 104 |
| 5863 | 51 | WikiPathways_2015 | http://www.wikipathways.org/index.php/Download_Pathways | 404 |
| 6768 | 47 | Reactome_2015 | http://www.reactome.org/download/index.html | 1389 |
| 25651 | 807 | ESCAPE | http://www.maayanlab.net/ESCAPE/ | 315 |
| 19129 | 1594 | HomoloGene | http://www.ncbi.nlm.nih.gov/homologene | 12 |
| 23939 | 293 | Disease_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23561 | 307 | Disease_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23877 | 302 | Drug_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 15886 | 9 | Genes_Associated_with_NIH_Grants | https://grants.nih.gov/grants/oer.htm | 32876 |
| 24350 | 299 | Drug_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 3102 | 25 | KEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 428 |
| 31132 | 298 | Gene_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 30832 | 302 | Gene_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 48230 | 1429 | ChEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 395 |
| 5613 | 36 | dbGaP | http://www.ncbi.nlm.nih.gov/gap | 345 |
| 9559 | 73 | LINCS_L1000_Chem_Pert_up | https://clue.io/ | 33132 |
| 9448 | 63 | LINCS_L1000_Chem_Pert_down | https://clue.io/ | 33132 |
| 16725 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_down | http://www.gtexportal.org/ | 2918 |
| 19249 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_up | http://www.gtexportal.org/ | 2918 |
| 15090 | 282 | Ligand_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 16129 | 292 | Aging_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15309 | 308 | Aging_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15103 | 318 | Ligand_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 15022 | 290 | MCF7_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15676 | 310 | MCF7_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15854 | 279 | Microbe_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 15015 | 321 | Microbe_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 3788 | 159 | LINCS_L1000_Ligand_Perturbations_down | https://clue.io/ | 96 |
| 3357 | 153 | LINCS_L1000_Ligand_Perturbations_up | https://clue.io/ | 96 |
| 12668 | 300 | L1000_Kinase_and_GPCR_Perturbations_down | https://clue.io/ | 3644 |
| 12638 | 300 | L1000_Kinase_and_GPCR_Perturbations_up | https://clue.io/ | 3644 |
| 8973 | 64 | Reactome_2016 | http://www.reactome.org/download/index.html | 1530 |
| 7010 | 87 | KEGG_2016 | http://www.kegg.jp/kegg/download/ | 293 |
| 5966 | 51 | WikiPathways_2016 | http://www.wikipathways.org/index.php/Download_Pathways | 437 |
| 15562 | 887 | ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X | 104 | |
| 17850 | 300 | Kinase_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 17660 | 300 | Kinase_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 1348 | 19 | BioCarta_2016 | http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 237 |
| 934 | 13 | HumanCyc_2016 | http://humancyc.org/ | 152 |
| 2541 | 39 | NCI-Nature_2016 | http://pid.nci.nih.gov/ | 209 |
| 2041 | 42 | Panther_2016 | http://www.pantherdb.org/pathway/ | 112 |
| 5209 | 300 | DrugMatrix | https://ntp.niehs.nih.gov/drugmatrix/ | 7876 |
| 49238 | 1550 | ChEA_2016 | http://amp.pharm.mssm.edu/Enrichr | 645 |
| 2243 | 19 | huMAP | http://proteincomplexes.org/ | 995 |
| 19586 | 545 | Jensen_TISSUES | http://tissues.jensenlab.org/ | 1842 |
| 22440 | 505 | RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 1302 |
| 8184 | 24 | MGI_Mammalian_Phenotype_2017 | http://www.informatics.jax.org/ | 5231 |
| 18329 | 161 | Jensen_COMPARTMENTS | http://compartments.jensenlab.org/ | 2283 |
| 15755 | 28 | Jensen_DISEASES | http://diseases.jensenlab.org/ | 1811 |
| 10271 | 22 | BioPlex_2017 | http://bioplex.hms.harvard.edu/ | 3915 |
| 10427 | 38 | GO_Cellular_Component_2017 | http://www.geneontology.org/ | 636 |
| 10601 | 25 | GO_Molecular_Function_2017 | http://www.geneontology.org/ | 972 |
| 13822 | 21 | GO_Biological_Process_2017 | http://www.geneontology.org/ | 3166 |
| 8002 | 143 | GO_Cellular_Component_2017b | http://www.geneontology.org/ | 816 |
| 10089 | 45 | GO_Molecular_Function_2017b | http://www.geneontology.org/ | 3271 |
| 13247 | 49 | GO_Biological_Process_2017b | http://www.geneontology.org/ | 10125 |
| 21809 | 2316 | ARCHS4_Tissues | http://amp.pharm.mssm.edu/archs4 | 108 |
| 23601 | 2395 | ARCHS4_Cell-lines | http://amp.pharm.mssm.edu/archs4 | 125 |
| 20883 | 299 | ARCHS4_IDG_Coexp | http://amp.pharm.mssm.edu/archs4 | 352 |
| 19612 | 299 | ARCHS4_Kinases_Coexp | http://amp.pharm.mssm.edu/archs4 | 498 |
| 25983 | 299 | ARCHS4_TFs_Coexp | http://amp.pharm.mssm.edu/archs4 | 1724 |
| 19500 | 137 | SysMyo_Muscle_Gene_Sets | http://sys-myo.rhcloud.com/ | 1135 |
| 14893 | 128 | miRTarBase_2017 | http://mirtarbase.mbc.nctu.edu.tw/ | 3240 |
| 17598 | 1208 | TargetScan_microRNA_2017 | http://www.targetscan.org/ | 683 |
| 5902 | 109 | Enrichr_Libraries_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 121 |
| 12486 | 299 | Enrichr_Submissions_TF-Gene_Coocurrence | http://amp.pharm.mssm.edu/Enrichr | 1722 |
| 1073 | 100 | Data_Acquisition_Method_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 12 |
| 19513 | 117 | DSigDB | http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ | 4026 |
| 14433 | 36 | GO_Biological_Process_2018 | http://www.geneontology.org/ | 5103 |
| 8655 | 61 | GO_Cellular_Component_2018 | http://www.geneontology.org/ | 446 |
| 11459 | 39 | GO_Molecular_Function_2018 | http://www.geneontology.org/ | 1151 |
| 19741 | 270 | TF_Perturbations_Followed_by_Expression | http://www.ncbi.nlm.nih.gov/geo/ | 1958 |
| 27360 | 802 | Chromosome_Location_hg19 | http://hgdownload.cse.ucsc.edu/downloads.html | 36 |
| 13072 | 26 | NIH_Funded_PIs_2017_Human_GeneRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 5687 |
| 13464 | 45 | NIH_Funded_PIs_2017_Human_AutoRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 13787 | 200 | Rare_Diseases_AutoRIF_ARCHS4_Predictions | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 13929 | 200 | Rare_Diseases_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 16964 | 200 | NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 17258 | 200 | NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 5684 |
| 10352 | 58 | Rare_Diseases_GeneRIF_Gene_Lists | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 10471 | 76 | Rare_Diseases_AutoRIF_Gene_Lists | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 12419 | 491 | SubCell_BarCode | http://www.subcellbarcode.org/ | 104 |
| 19378 | 37 | GWAS_Catalog_2019 | https://www.ebi.ac.uk/gwas | 1737 |
| 6201 | 45 | WikiPathways_2019_Human | https://www.wikipathways.org/ | 472 |
| 4558 | 54 | WikiPathways_2019_Mouse | https://www.wikipathways.org/ | 176 |
| 3264 | 22 | TRRUST_Transcription_Factors_2019 | https://www.grnpedia.org/trrust/ | 571 |
| 7802 | 92 | KEGG_2019_Human | https://www.kegg.jp/ | 308 |
| 8551 | 98 | KEGG_2019_Mouse | https://www.kegg.jp/ | 303 |
| 12444 | 23 | InterPro_Domains_2019 | https://www.ebi.ac.uk/interpro/ | 1071 |
| 9000 | 20 | Pfam_Domains_2019 | https://pfam.xfam.org/ | 608 |
| 7744 | 363 | DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 | https://depmap.org/ | 558 |
| 6204 | 387 | DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 | https://depmap.org/ | 325 |
| 13420 | 32 | MGI_Mammalian_Phenotype_Level_4_2019 | http://www.informatics.jax.org/ | 5261 |
| 14148 | 122 | UK_Biobank_GWAS_v1 | https://www.ukbiobank.ac.uk/tag/gwas/ | 857 |
| 9813 | 49 | BioPlanet_2019 | https://tripod.nih.gov/bioplanet/ | 1510 |
| 1397 | 13 | ClinVar_2019 | https://www.ncbi.nlm.nih.gov/clinvar/ | 182 |
| 9116 | 22 | PheWeb_2019 | http://pheweb.sph.umich.edu/ | 1161 |
| 17464 | 63 | DisGeNET | https://www.disgenet.org | 9828 |
| 394 | 73 | HMS_LINCS_KinomeScan | http://lincs.hms.harvard.edu/kinomescan/ | 148 |
| 11851 | 586 | CCLE_Proteomics_2020 | https://portals.broadinstitute.org/ccle | 378 |
| 8189 | 421 | ProteomicsDB_2020 | https://www.proteomicsdb.org/ | 913 |
| 18704 | 100 | lncHUB_lncRNA_Co-Expression | https://amp.pharm.mssm.edu/lnchub/ | 3729 |
| 5605 | 39 | Virus-Host_PPI_P-HIPSTer_2020 | http://phipster.org/ | 6715 |
| 5718 | 31 | Elsevier_Pathway_Collection | http://www.transgene.ru/disease-pathways/ | 1721 |
| 14156 | 40 | Table_Mining_of_CRISPR_Studies | 802 | |
| 16979 | 295 | COVID-19_Related_Gene_Sets | https://amp.pharm.mssm.edu/covid19 | 205 |
# DEGs up and down per cluster
cluster_all = sort(unique(sc_markers[,"cluster"]))
genesets_up = lapply(cluster_all, function(x) {
tmp = sc_markers_filt_up %>%
dplyr::filter(cluster==x) %>%
dplyr::pull(gene)
# Pick the first matching Entrez symbol
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1]) %>%
na.exclude() %>% unique()
return(tmp)
})
genesets_down = lapply(cluster_all, function(x) {
tmp = sc_markers_filt_down %>%
dplyr::filter(cluster==x) %>%
dplyr::pull(gene)
# Pick the first matching Entrez symbol
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1]) %>%
na.exclude() %>% unique()
return(tmp)
})
names(genesets_up) = paste0("DEG_up_cluster_", cluster_all)
names(genesets_down) = paste0("DEG_down_cluster_", cluster_all)
genesets = c(genesets_up, genesets_down)
# Loop through gene lists
enriched = list()
for (i in seq(genesets)) {
if (length(genesets[[i]]) >= 3) {
enriched[[i]] = enrichR::enrichr(genesets[[i]], databases=param$enrichr_dbs)
enriched[[i]] = purrr::map(enriched[[i]], function(df) {
df %>% dplyr::filter(Adjusted.P.value < param$p_enrichr)
})
} else {
message("Geneset ", names(genesets)[i], " has less than 3 genes; skip enrichr")
enriched[[i]] = NULL
}
}
names(enriched) = names(genesets)
# Write enrichment results to file
enriched_top = matrix(NA, nrow=0, ncol=6)
colnames(enriched_top) = c("GeneSet", "Database", "Term", "Overlap", "Adjusted_pval", "Genes")
for (i in seq(enriched)) {
if (!is.null(enriched[[i]])) {
openxlsx::write.xlsx(enriched[[i]], file=paste0(param$path_out, "/Functions_", names(enriched)[i], ".xlsx"))
}
}The following table contains the top enriched term per geneset and database.
for (i in seq(enriched)) {
if (!is.null(enriched[[i]])) {
# Remember top term per geneset
for (j in seq(enriched[[i]])) {
enriched_top = rbind(enriched_top,
c(names(enriched)[i],
names(enriched[[i]])[j],
enriched[[i]][[j]][1, c("Term", "Overlap", "Adjusted.P.value", "Genes")]))
}
}
}
# Print table of top terms per gene set
knitr::kable(enriched_top, align="l", caption="Top enriched term per geneset") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px")| GeneSet | Database | Term | Overlap | Adjusted_pval | Genes |
|---|---|---|---|---|---|
| DEG_up_cluster_1 | GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | 3/6 | 0.00236524803711237 | LCK;CD3G;CD3E |
| DEG_up_cluster_1 | GO_Biological_Process_2018 | regulation of immune response (GO:0050776) | 18/251 | 7.75365345786931e-13 | ITGB1;ZNF683;CD81;TRAC;HLA-B;HLA-C;CD3G;CD3E;ITGAL;SLA2;CD3D;CD8B;CD8A;TRBC1;KLRD1;B2M;HCST;CD99 |
| DEG_up_cluster_1 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 6/16 | 3.36278220581764e-08 | ZAP70;CD8B;CD8A;CD3G;CD3E;CD3D |
| DEG_up_cluster_2 | GO_Molecular_Function_2018 | NA | NA | NA | NA |
| DEG_up_cluster_2 | GO_Biological_Process_2018 | NA | NA | NA | NA |
| DEG_up_cluster_2 | GO_Cellular_Component_2018 | NA | NA | NA | NA |
| DEG_up_cluster_3 | GO_Molecular_Function_2018 | cadherin binding (GO:0045296) | 30/313 | 0.000191778438016194 | RAB1A;TWF2;CAPG;ENO1;IQGAP1;PSEN1;SDCBP;LASP1;SNX2;PCBP1;PRDX1;CTNNA1;LRRFIP1;CLIC1;S100A11;VASP;ANXA1;JUP;ANXA2;STAT1;RAB10;SHTN1;EIF5;CAPZA1;NUMB;PLIN3;ALDOA;EIF4G2;PICALM;PLEC |
| DEG_up_cluster_3 | GO_Biological_Process_2018 | neutrophil activation involved in immune response (GO:0002283) | 136/483 | 6.40896800436883e-86 | FCN1;CDA;CYFIP1;RAB3D;PYCARD;LGALS3;ANPEP;FTH1;LAMP2;TOM1;COTL1;SIRPA;CD93;CD300A;SLC11A1;CYBB;RNASE2;OSCAR;RAB31;TYROBP;DOK3;CRISPLD2;S100A9;S100A8;FTL;CFD;BRI3;STXBP2;C5AR1;FPR1;MGST1;IQGAP1;PSEN1;CFP;GNS;SDCBP;COMMD9;ALDH3B1;PSAP;S100A12;NCKAP1L;LTA4H;CD14;S100A11;JUP;FUCA1;FUCA2;ADA2;ARPC5;LYZ;BST2;BST1;SELL;MAN2B1;SIGLEC9;SERPINA1;ITGAM;HEXB;ITGB2;CTSZ;GDI2;SLC2A3;PYGL;TCIRG1;FCAR;CTSS;SIRPB1;HK3;GLIPR1;GM2A;TIMP2;ITGAX;CTSH;QSOX1;CD36;RAC1;GUSB;CTSD;CD33;CTSC;CTSB;CAP1;ACTR2;SERPINB1;MANBA;FCER1G;ANXA2;SYK;NFAM1;PGAM1;RNASET2;GAA;ATP6AP2;RHOG;PLAUR;TNFRSF1B;RHOA;CKAP4;SERPINB6;FGR;RAP2B;NPC2;CAT;PECAM1;PLEKHO2;ALDOA;TLR2;CSTB;CD63;ASAH1;GRN;CLEC12A;GSTP1;PTAFR;FGL2;PRCP;LILRA2;CST3;ALOX5;CREG1;CD55;GCA;CMTM6;LILRB2;LILRB3;CPPED1;RAB10;ERP44;FCGR2A;IMPDH1;GLB1;P2RX1;QPCT;PTPN6;MNDA;CD68 |
| DEG_up_cluster_3 | GO_Cellular_Component_2018 | ficolin-1-rich granule (GO:0101002) | 59/184 | 1.81163569520284e-39 | FCN1;CDA;SERPINA1;CTSZ;ITGB2;SLC2A3;PYGL;TCIRG1;CTSS;FCAR;LGALS3;HK3;FTH1;LAMP2;COTL1;TIMP2;ITGAX;SIRPA;CTSH;RAC1;GUSB;CTSD;CTSB;ACTR2;FCER1G;CD93;CD300A;SLC11A1;PGAM1;GAA;ATP6AP2;RHOA;SERPINB6;DOK3;CRISPLD2;CAT;PLEKHO2;ALDOA;CFD;CSTB;ASAH1;GSTP1;FGL2;FPR1;PRCP;GNS;CST3;COMMD9;ALOX5;NCKAP1L;LTA4H;CD55;JUP;ARPC5;LILRB2;GLB1;IMPDH1;QPCT;MNDA |
| DEG_up_cluster_4 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 42/1387 | 2.61365448398768e-18 | RPL30;RPL32;RPL10;RPL34;RPL12;RPL11;SAMSN1;ZFP36L2;GSPT1;RPS16;RPS15A;RPS19;RPL18A;RPS18;RPL36;RPL14;RPL35;RPL13;RPL38;RPL37;RPS2;RPS27A;RPL39;RPS13;HNRNPA0;RPL41;RPL22;RPL35A;RPSA;RPL23A;RPS25;RPS28;RPS27;RCAN3;RPL37A;RPL24;EIF3E;VIM;RPL29;RPL28;RPS21;TPT1 |
| DEG_up_cluster_4 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 36/89 | 1.41535570725476e-56 | RPL30;RPL32;RPL10;RPL34;RPLP1;RPL12;RPL11;RPS16;RPS15A;RPL18A;RPS19;RPS18;RPL36;RPL14;RPL35;RPL13;RPL38;RPL37;RPS2;RPS27A;RPL39;RPS13;RPL41;RPL22;RPL35A;RPSA;RPL23A;RPS25;RPS28;RPS27;RPS29;RPL37A;RPL24;RPL29;RPL28;RPS21 |
| DEG_up_cluster_4 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 37/124 | 1.98947868648437e-53 | RPL30;RPL32;RPL10;RPL34;RPLP1;RPL12;RPL11;RPS16;RPS15A;RPS19;RPL18A;RPS18;RPL36AL;RPL36;RPL14;RPL35;RPL13;RPL38;RPL37;RPS2;RPS27A;RPL39;RPS13;RPL41;RPL22;RPL35A;RPSA;RPL23A;RPS25;RPS28;RPS27;RPS29;RPL37A;RPL24;RPL29;RPL28;RPS21 |
| DEG_up_cluster_5 | GO_Molecular_Function_2018 | MHC class II receptor activity (GO:0032395) | 7/10 | 2.59939577769911e-10 | HLA-DRA;HLA-DOA;HLA-DQA2;HLA-DQA1;HLA-DQB2;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_5 | GO_Biological_Process_2018 | antigen receptor-mediated signaling pathway (GO:0050851) | 25/257 | 2.36688229410734e-16 | BLK;IGHM;IGHG3;CD79B;CD79A;IGHG1;IGKC;CD19;IGLC3;PDE4B;IGLC2;IGHA1;HLA-DQA2;HLA-DQA1;HLA-DPA1;MEF2C;HLA-DRB5;NFKBIA;BCL2;IGHD;HLA-DPB1;HLA-DRA;HLA-DRB1;HLA-DQB2;HLA-DQB1 |
| DEG_up_cluster_5 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 12/14 | 1.95133997445852e-21 | CD74;HLA-DRB5;HLA-DMA;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-DOA;HLA-DQA1;HLA-DQB2;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_6 | GO_Molecular_Function_2018 | non-membrane spanning protein tyrosine kinase activity (GO:0004715) | 5/43 | 0.00219944480793806 | ZAP70;TXK;MATK;SLA2;JAK1 |
| DEG_up_cluster_6 | GO_Biological_Process_2018 | regulation of immune response (GO:0050776) | 19/251 | 4.57299786278372e-14 | IFITM1;KLRB1;ITGB2;SH2D1B;KIR3DL1;KIR2DL1;KIR3DL2;ITGAL;SLA2;KIR2DL3;NCR1;FCGR3A;NCR3;TYROBP;KLRF1;SLAMF7;KLRD1;KLRC1;CD247 |
| DEG_up_cluster_6 | GO_Cellular_Component_2018 | specific granule membrane (GO:0035579) | 5/91 | 0.0343372714243856 | ITGAM;ITGB2;ADAM8;ITGAL;SLC15A4 |
| DEG_up_cluster_7 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 42/1387 | 1.9189950564277e-24 | RPL4;RPL5;RPL3;RPL32;RPL10;RPL31;RPLP0;RPL11;RPL10A;RPS4Y1;RPL9;RPL7;RPS4X;RPS14;RPL7A;RPS16;RPS18;RACK1;RPS3;RPL14;RPL13;RPL15;RPL18;RPS13;RPL19;RPS9;NPM1;RPS8;RPS5;RPS6;NOSIP;RPL13A;RPL35A;RPS3A;RPSA;EEF1A1;RPS25;RCAN3;EIF3E;RPS20;RPL29;TPT1 |
| DEG_up_cluster_7 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 36/89 | 3.39472958505006e-62 | RPL4;RPL5;RPL3;RPL32;RPL10;RPL31;RPLP0;RPL11;RPL10A;RPS4Y1;RPL9;RPL7;RPS4X;RPS14;RPL7A;RPS16;RPS18;RPS3;RPL14;RPLP2;RPL13;RPL15;RPL18;RPS13;RPL19;RPS9;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPS3A;RPSA;RPS25;RPS20;RPL29 |
| DEG_up_cluster_7 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 37/124 | 3.12011197124062e-59 | RPL4;RPL5;RPL3;RPL32;RPL10;RPL31;RPLP0;RPL11;RPL10A;RPS4Y1;RPL9;RPL7;RPS4X;RPS14;RPL7A;RPS16;RPS18;RACK1;RPS3;RPL14;RPLP2;RPL13;RPL15;RPL18;RPS13;RPL19;RPS9;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPS3A;RPSA;RPS25;RPS20;RPL29 |
| DEG_up_cluster_8 | GO_Molecular_Function_2018 | cadherin binding (GO:0045296) | 15/313 | 0.0105676188468544 | ITGB1;ENO1;PARK7;PRDX6;YWHAZ;PDLIM1;HNRNPK;PKM;CTTN;CAPZA1;MYH9;TAGLN2;TLN1;ALDOA;VCL |
| DEG_up_cluster_8 | GO_Biological_Process_2018 | platelet degranulation (GO:0002576) | 23/124 | 5.23390338728804e-17 | SRGN;APP;SPARC;WDR1;ACTN1;ITGB3;ITGA2B;STXBP2;ENDOD1;F13A1;PPBP;CLU;THBS1;TUBA4A;TMSB4X;CD9;TAGLN2;TIMP1;TLN1;ALDOA;VCL;FERMT3;PF4 |
| DEG_up_cluster_8 | GO_Cellular_Component_2018 | platelet alpha granule (GO:0031091) | 16/90 | 8.1357810642281e-12 | SRGN;APP;SPARC;ITGB3;ACTN1;ITGA2B;F13A1;PPBP;CLU;THBS1;TMSB4X;CD9;TIMP1;ALDOA;PF4;FERMT3 |
| DEG_down_cluster_1 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 3/16 | 0.0204886175256104 | CD74;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_1 | GO_Biological_Process_2018 | neutrophil degranulation (GO:0043312) | 16/479 | 1.02799196908971e-08 | FCER1G;SLC11A1;SLC2A3;CFP;CST3;LGALS3;SDCBP;TYROBP;SELL;NPC2;FTH1;COTL1;PECAM1;CTSH;S100A11;CD55 |
| DEG_down_cluster_1 | GO_Cellular_Component_2018 | tertiary granule (GO:0070820) | 9/164 | 1.29376452083819e-06 | CST3;LGALS3;FCER1G;FTH1;SLC11A1;CTSH;SLC2A3;CFP;CD55 |
| DEG_down_cluster_2 | GO_Molecular_Function_2018 | NA | NA | NA | NA |
| DEG_down_cluster_2 | GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen (GO:0002478) | 6/97 | 4.64696592860875e-05 | FCER1G;AP1S2;AP2S1;HLA-DRA;CTSS;HLA-DRB1 |
| DEG_down_cluster_2 | GO_Cellular_Component_2018 | lysosome (GO:0005764) | 8/422 | 0.000110636966202871 | ASAH1;SNX2;PSAP;AP1S2;HLA-DRA;CTSS;HLA-DRB1;CTSB |
| DEG_down_cluster_3 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 82/1387 | 1.02898103718102e-28 | RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP0;RPL10A;RPL6;SYNE1;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPL18A;RPS19;RPS18;RPL36;RPL35;RPL38;RPL37;RPL39;RPS10;SNRPN;RPL21;RPS7;RPS8;RPS5;RPL22;RPS6;CIRBP;RPL13A;RPSA;RPS3A;EEF1A1;EEF1D;RPL37A;RPL24;LUC7L3;RPL27;RPL29;RPL28;SRSF7;DDX6;RPL10;DDX24;RPL12;RPL11;RPS4Y1;ZFP36L2;RPS15A;RPL14;RPS3;RPL13;RPS2;RPL15;RPL18;RPS27A;LYAR;RPL19;RBM38;RPL41;JUN;NPM1;PRRC2C;RPL35A;RPL23A;NAP1L4;RPS26;RPS25;RPS28;RPS27;RPL27A;RPS20;CALR;FAU;RPS21;NOP53;RPS23;TNRC6B |
| DEG_down_cluster_3 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 65/89 | 5.33732848687609e-101 | RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL10A;RPL6;RPS4X;RPS15;RPL7A;RPS14;RPS16;RPL18A;RPS19;RPS18;RPL36;RPL35;RPLP2;RPL38;RPL37;RPL39;RPS10;RPL21;RPS7;RPS8;RPS5;RPL22;RPS6;RPL13A;RPS3A;RPSA;RPL37A;RPL24;RPL27;RPL29;RPL28;RPL10;RPL12;RPL11;RPS4Y1;RPS15A;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL27A;RPS20;RPS21;RPS23 |
| DEG_down_cluster_3 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 66/124 | 1.24053506647338e-89 | RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL10A;RPL6;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPS19;RPL18A;RPL36AL;RPS18;RPL36;RPL35;RPLP2;RPL38;RPL37;RPL39;RPS10;RPL21;RPS7;RPS8;RPS5;RPL22;RPS6;RPL13A;RPSA;RPS3A;RPL37A;RPL24;RPL27;RPL29;RPL28;RPL10;RPL12;RPL11;RPS4Y1;RPS15A;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL27A;RPS20;RPS21;RPS23 |
| DEG_down_cluster_4 | GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | 5/19 | 1.54292851897429e-06 | CD74;HLA-DMA;HLA-DRA;KLRD1;HLA-DRB1 |
| DEG_down_cluster_4 | GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen (GO:0002478) | 10/97 | 4.57948881759675e-10 | CD74;HLA-DMA;HLA-DRB5;FCER1G;HLA-DPB1;HLA-DRA;CTSD;CTSS;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_4 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 7/14 | 1.05264902600997e-12 | CD74;HLA-DRB5;HLA-DMA;HLA-DPB1;HLA-DRA;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_5 | GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | 4/6 | 0.000125917393275235 | LCK;CD3G;CD3E;HLA-E |
| DEG_down_cluster_5 | GO_Biological_Process_2018 | T cell activation (GO:0042110) | 16/88 | 7.31287199564278e-13 | ITK;MSN;CD3G;CD3E;SLA2;CD3D;CD2;ZAP70;CASP8;CD8B;LCK;CD8A;CD7;FYN;LCP1;LAT |
| DEG_down_cluster_5 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 8/16 | 2.70506704128307e-10 | ZAP70;CD6;CD8B;CD8A;CD3G;CD247;CD3E;CD3D |
| DEG_down_cluster_6 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 18/1387 | 1.07745994943648e-09 | RPL32;RPS8;RPL11;RPL8;RPL9;RPS15;RPS16;RPL18A;RPS19;RPS18;RCAN3;RPL13;VIM;PABPC1;RPL18;RPL28;RPS13;TPT1 |
| DEG_down_cluster_6 | GO_Biological_Process_2018 | nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) | 15/112 | 5.46027709504545e-22 | RPL32;RPS8;RPL11;RPL8;RPL9;RPS15;RPS16;RPL18A;RPS19;RPS18;RPL13;PABPC1;RPL18;RPL28;RPS13 |
| DEG_down_cluster_6 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 14/124 | 3.25228285628568e-20 | RPL32;RPS8;RPL11;RPL8;RPL9;RPS15;RPS16;RPL18A;RPS19;RPS18;RPL13;RPL18;RPL28;RPS13 |
| DEG_down_cluster_7 | GO_Molecular_Function_2018 | cadherin binding (GO:0045296) | 12/313 | 3.47522714685567e-06 | ITGB1;CAST;ANXA1;ANXA2;AHNAK;EFHD2;FLNA;IQGAP1;PFN1;CLIC1;YWHAZ;S100A11 |
| DEG_down_cluster_7 | GO_Biological_Process_2018 | cytokine-mediated signaling pathway (GO:0019221) | 20/633 | 1.37337276075139e-09 | ITGB1;HLA-DRB5;ANXA1;ANXA2;IFNGR1;IL10RA;HLA-B;MSN;HLA-A;TNFRSF1B;YWHAZ;CCL5;CFL1;UBC;HLA-DPB1;HLA-DRA;MAP3K8;LCP1;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_7 | GO_Cellular_Component_2018 | MHC protein complex (GO:0042611) | 8/18 | 6.99176446090177e-13 | CD74;HLA-DRB5;HLA-B;HLA-DPB1;HLA-DRA;HLA-A;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_8 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 223/1387 | 6.11681849993887e-109 | RPL4;EIF4A2;RPL5;EIF4A1;RPL30;RPL3;RPL32;RPL31;RPL34;HNRNPU;HNRNPR;RPL8;RPL9;RPL6;RPL7;RPS15;PNN;RPS14;RPS17;LGALS1;RPS16;SNRPD2;RPS19;RPL18A;RPS18;RPL36;RPL35;ARL6IP4;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;DDX17;SRRM2;RPL21;SCAF11;RPL23;RPL22;CIRBP;SRRM1;FAM133B;EWSR1;SUB1;RPL24;RPL27;RPL26;RPL29;EZR;PFN1;RPL28;SF1;BAZ2A;ZC3HAV1;RPS4Y1;MTDH;PCBP2;BTF3;PRPF38B;JUN;RPL41;XRCC6;XRCC5;PRRC2C;EIF1;HNRNPL;RPS26;HNRNPM;RPS25;RPS28;EIF5;RPS27;RPL27A;HNRNPD;RPS20;HNRNPC;CALR;RPS21;RPS24;RPS23;SMG1;RPLP0;RACK1;SEC61B;ANXA2;SRP9;NSA2;HNRNPH1;CANX;STRAP;ARHGEF1;PABPC1;SF3B2;AHNAK;RPL10;SF3B6;RPL12;RPL11;RPS27L;EXOSC6;BCLAF1;RPS15A;RPL14;RPS3;RPL13;RPS2;RPL15;RPL18;RPS27A;SRSF10;SF3B1;SRSF11;RPL19;NPM1;MDH2;RSRC2;RSL1D1;SERBP1;FAU;PEBP1;RPL10A;SRP14;BZW1;ZFP36;ZNF207;HMGN2;PDIA3;CAST;PNISR;RPS9;RPS7;RPS8;RPS5;RPS6;PRPF4B;RPSA;TUFM;EEF1A1;ILF3;HNRNPUL1;NCL;SRSF2;SBDS;SRSF3;SRSF4;S100A4;SRSF5;SNRNP200;PPIB;SRSF7;PPIA;KPNB1;DDX6;DDX5;RBM8A;CORO1A;ZFP36L2;HSP90B1;HNRNPDL;UBC;HSPA9;HSPA8;UBE2I;FUS;NONO;TBCA;EEF2;SON;EIF2S3;RSL24D1;RBM25;HSP90AB1;DDX3X;CELF2;HMGB2;ADAR;YBX1;SYNE1;RBM3;RPS4X;RPL7A;SUMO1;SUMO2;HSP90AA1;NCBP1;RPL13A;RPS3A;SFPQ;NEMF;SYF2;THRAP3;EEF1D;RPL37A;LUC7L3;SLC25A5;DDX24;RPL36A;TRA2B;TRA2A;EIF4H;SAP18;HNRNPA1;EIF4B;HNRNPA0;RBM39;HNRNPA3;CNBP;RPL35A;RPL23A;EIF3L;EIF3G;EIF3H;EIF3E;SSBP1;VIM;EIF3D;NOP53;RBMX;EIF3A;TPT1;RAN;TNRC6B |
| DEG_down_cluster_8 | GO_Biological_Process_2018 | protein targeting to ER (GO:0045047) | 83/97 | 1.05079812652634e-111 | RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;RPLP0;SRP14;RPL8;RPL10A;RPL9;RPL6;RPL7;RPS15;RPS4X;RPS14;RPL7A;RPS17;RPS16;RPS19;RPL18A;RPS18;SEC61G;RPL36;RPL35;RPLP2;RPL38;RPL37;SEC62;RPS11;RPL39;RPS10;RPS13;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPL13A;RPSA;RPS3A;SRP9;RPL37A;RPL24;RPL27;RPL26;RPL29;RPL28;UBA52;RPL10;RPL12;RPL11;RPL36A;RPS4Y1;RPS15A;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;SPCS2;RPS27;SPCS1;RPS29;RPL27A;RPS20;RPS21;RPS24;RPS23 |
| DEG_down_cluster_8 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 82/124 | 1.53990078618172e-94 | RPL4;RPL5;RPL30;RPL3;DDX3X;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL8;RPL10A;RPL9;RPL6;RPL7;RPS15;RPS4X;RPS14;RPL7A;RPS17;RPS16;RPS19;RPL18A;RPL36AL;RPS18;RPL36;RACK1;RPL35;RPLP2;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPL13A;RPSA;RPS3A;RPL37A;RPL24;RPL27;RPL26;RPL29;RPL28;RPL10;RPL12;RPL11;RPL36A;RPS27L;RPS4Y1;RPS15A;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RSL1D1;RPS28;RPS27;RPS29;RPL27A;RPS20;RPS21;RSL24D1;RPS24;RPS23 |
We export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser.
# Export UMAP coordinates
loupe_umap = as.data.frame(sc@reductions$umap@cell.embeddings)
loupe_umap = cbind(Barcode=rownames(loupe_umap), loupe_umap)
colnames(loupe_umap) = c("Barcode", "UMAP-1", "UMAP-2")
write.table(loupe_umap, file=paste0(param$path_out, "/Seurat2Loupe_umap.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export categorical metadata
loupe_meta = as.data.frame(sc@meta.data)
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
loupe_meta = cbind(Barcode=rownames(loupe_meta), loupe_meta[, idx_keep])
write.table(x=loupe_meta, file=paste0(param$path_out, "/Seurat2Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export gene sets
loupe_genesets = data.frame(List=paste0("DEG_up_cluster_", sc_markers_filt_up[,"cluster"]),
Name=sc_markers_filt_up[,"gene"],
Ensembl=seurat_rowname_to_ensembl[sc_markers_filt_up[,"gene"]])
loupe_genesets = rbind(loupe_genesets,
data.frame(List=paste0("DEG_down_cluster_", sc_markers_filt_down[,"cluster"]),
Name=sc_markers_filt_down[,"gene"],
Ensembl=seurat_rowname_to_ensembl[sc_markers_filt_down[,"gene"]]))
genesets_to_export = list(genes_cc_s_phase=genes_s[,2], genes_cc_g2m_phase=genes_g2m[,2])
for (i in names(genesets_to_export)) {
tmp_genes = genesets_to_export[[i]]
tmp_genes = tmp_genes[tmp_genes %in% names(symbol_to_ensembl)]
loupe_genesets = rbind(loupe_genesets,
data.frame(List=i,
Name=tmp_genes,
Ensembl=seurat_rowname_to_ensembl[tmp_genes]))
}
write.table(loupe_genesets, file=paste0(param$path_out, "/Seurat2Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")We export the assay data, clustering, visualisation, marker genes and enriched pathways in a format that can be read by the Cerebro Browser (romanhaa 2020)
# top expressed genes
sc = cerebroApp::getMostExpressedGenes(sc, column_cluster="seurat_clusters", column_sample="orig.ident", assay=Seurat::DefaultAssay(sc))
gene_lists_for_cerebro = split(sc_markers_filt$gene,sc_markers_filt$cluster)
names(gene_lists_for_cerebro) = paste("Marker cluster",names(gene_lists_for_cerebro))
gene_lists_for_cerebro[["G2M_phase_genes"]] = genes_g2m[,2]
gene_lists_for_cerebro[["S_phase_genes"]] = genes_s[,2]
gene_lists_for_cerebro[["mitochondrial_genes"]] = grep(param$mt,rownames(sc), v=TRUE)
cerebro_species = gsub("_gene_ensembl", "", param$mart_dataset)
cerebro_species = ifelse(grepl("sapiens", cerebro_species), "Hg", ifelse(grepl("musculus", "Mm", cerebro_species)))
res = ExportToCerebro(sc=sc, path=paste0(param$path_out,"/cerebro.crb"), param=param, project=param$project_id, species=cerebro_species, assay=DefaultAssay(sc), column_sample="orig.ident", column_cluster="seurat_clusters", column_ccphase="Phase", gene_lists=gene_lists_for_cerebro, marker_genes=sc_markers_filt, enriched_pathways=enriched)All files generated with this report are written into the provided output folder test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/:
The following parameters were used to run the workflow.
out = scrnaseq_params_info(params=param)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")| Name | Value |
|---|---|
| project_id | pbmc |
| path_data | name:pbmc_10x, pbmc_smartseq2; type:10x, smartseq2; path:test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/, test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz |
| path_out | test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/ |
| file_known_markers | test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/known_markers.xlsx |
| mart_dataset | hsapiens_gene_ensembl |
| annot_version | 98 |
| annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=external_gene_name |
| file_annot | test_datasets/10x_SmartSeq2_pbmc_GSE132044/results//hsapiens_gene_ensembl.v98.annot.txt |
| mart_attributes | ensembl_gene_id, external_gene_name, external_gene_name, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
| mt | ^MT- |
| cell_filter | pbmc_10x:nFeature_RNA=c(200, NA), percent_mt=c(NA, 20); pbmc_smartseq2.sample1:nFeature_RNA=c(200, NA), percent_mt=c(NA, 20) |
| feature_filter | pbmc_10x:min_counts=1, min_cells=3; pbmc_smartseq2.sample1:min_counts=1, min_cells=3 |
| samples_to_drop | NC, RNA |
| cc_remove | FALSE |
| cc_remove_all | FALSE |
| integrate_samples | method:standard; reference_dataset:1; dimensions:30 |
| normalisation_default | SCT |
| pc_n | 10 |
| cluster_resolution | 0.5 |
| padj | 0.05 |
| log2fc | 0.584962500721156 |
| p_enrichr | 0.05 |
| enrichr_dbs | GO_Molecular_Function_2018, GO_Biological_Process_2018, GO_Cellular_Component_2018 |
| col | palevioletred |
| col_palette_samples | ggsci::pal_jama |
| col_palette_clusters | ggsci::pal_startrek |
| sample_cells | 500 |
| path_to_git | . |
| col_samples | pbmc_10x=#374E55FF, pbmc_smartseq2.sample1=#DF8F44FF |
| col_clusters | 1=#CC0C00FF, 2=#5C88DAFF, 3=#84BD00FF, 4=#FFCD00FF, 5=#7C878EFF, 6=#00B5E2FF, 7=#00AF66FF, 8=#CC0C00B2 |
This report was generated using the scrnaseq GitHub repository. Software versions were collected at run time.
out = scrnaseq_session_info(param$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Name | Version |
|---|---|
| ktrns/scrnaseq | 94536c83158fd9a9cc989fc27639fcd95d300385 |
| R | R version 3.6.1 (2019-07-05) |
| Platform | x86_64-apple-darwin15.6.0 (64-bit) |
| Operating system | macOS Mojave 10.14.6 |
| Packages | abind1.4-5, annotate1.62.0, AnnotationDbi1.46.1, ape5.3, assertthat0.2.1, backports1.1.5, bibtex0.4.2, Biobase2.44.0, BiocGenerics0.30.0, BiocManager1.30.10, BiocParallel1.18.1, bit1.1-14, bit640.9-7, bitops1.0-6, blme1.0-4, blob1.2.0, boot1.3-23, callr3.4.3, caTools1.17.1.3, cerebroApp1.2.2, cli2.0.2, cluster2.1.0, codetools0.2-16, colorspace1.4-1, colourpicker1.0, cowplot1.0.0, crayon1.3.4, curl4.3, data.table1.12.6, DBI1.0.0, DelayedArray0.10.0, desc1.2.0, devtools2.3.0, digest0.6.25, dplyr0.8.3, DT0.13, ellipsis0.3.0, enrichR2.1, evaluate0.14, fansi0.4.0, farver2.0.1, fastmap1.0.1, fitdistrplus1.0-14, formattable0.2.0.1, fs1.4.1, future1.15.1, future.apply1.3.0, gdata2.18.0, geneplotter1.62.0, GenomeInfoDb1.20.0, GenomeInfoDbData1.2.1, GenomicRanges1.36.1, ggplot23.2.1, ggrepel0.8.1, ggridges0.5.1, ggsci2.9, ggtree1.16.6, globals0.12.5, glue1.4.1, gplots3.0.1.1, graph1.62.0, gridExtra2.3, GSEABase1.46.0, GSVA1.32.0, gtable0.3.0, gtools3.8.1, highr0.8, hms0.5.2, htmltools0.4.0, htmlwidgets1.5.1, httpuv1.5.2, httr1.4.1, ica1.0-2, igraph1.2.4.2, IRanges2.18.3, irlba2.3.3, jsonlite1.6.1, kableExtra1.1.0, KernSmooth2.23-16, knitcitations1.0.10, knitr1.26, labeling0.3, later1.0.0, lattice0.20-38, lazyeval0.2.2, leiden0.3.1, lifecycle0.1.0, listenv0.8.0, lme41.1-21, lmtest0.9-37, lsei1.2-0, lubridate1.7.4, magrittr1.5, MASS7.3-51.4, Matrix1.2-18, matrixStats0.55.0, memoise1.1.0, mime0.7, miniUI0.1.1.1, minqa1.2.4, msigdbr7.1.1, munsell0.5.0, nlme3.1-142, nloptr1.2.1, npsurv0.4-0, openxlsx4.1.4, patchwork1.0.0, pbapply1.4-2, pillar1.4.2, pkgbuild1.0.8, pkgconfig2.0.3, pkgload1.0.2, plotly4.9.1, plyr1.8.4, png0.1-7, prettyunits1.0.2, processx3.4.2, progress1.2.2, promises1.1.0, ps1.3.2, purrr0.3.3, qvalue2.16.0, R.methodsS31.7.1, R.oo1.23.0, R.utils2.9.2, R62.4.1, RANN2.6.1, RColorBrewer1.1-2, Rcpp1.0.3, RcppAnnoy0.0.14, RcppParallel4.4.4, RCurl1.95-4.12, readr1.3.1, RefManageR1.2.12, remotes2.1.1, reshape21.4.3, reticulate1.13, rjson0.2.20, rlang0.4.6, rmarkdown1.18, ROCR1.0-7, rprojroot1.3-2, RSpectra0.16-0, RSQLite2.1.4, rstudioapi0.11, rsvd1.0.2, Rtsne0.15, rvcheck0.1.7, rvest0.3.5, S4Vectors0.22.1, scales1.1.0, sctransform0.2.0, sessioninfo1.1.1, Seurat3.1.5, shiny1.4.0, shinydashboard0.7.1, shinythemes1.1.2, shinyWidgets0.5.3, SingleCellExperiment1.6.0, stringi1.4.3, stringr1.4.0, SummarizedExperiment1.14.1, survival3.1-8, testthat2.3.2, tibble2.1.3, tidyr1.0.0, tidyselect0.2.5, tidytree0.3.3, treeio1.8.2, tsne0.1-3, usethis1.6.1, uwot0.1.5, vctrs0.2.0, viridis0.5.1, viridisLite0.3.0, webshot0.5.2, withr2.1.2, xfun0.11, XML3.98-1.20, xml21.2.2, xtable1.8-4, XVector0.24.0, yaml2.2.0, zeallot0.1.0, zip2.0.4, zlibbioc1.30.0, zoo1.8-6 |
Chen, Edward Y. 2020. “Enrichr.” . https://amp.pharm.mssm.edu/Enrichr/.
Gandolfo, Luke C., and Terence P. Speed. 2018. “RLE Plots: Visualizing Unwanted Variation in High Dimensional Data.” Edited by Enrique Hernandez-Lemus. PLOS ONE 13 (2). Public Library of Science (PLoS): e0191629. https://doi.org/10.1371/journal.pone.0191629.
Hafemeister, Christoph, and Rahul Satija. 2019. “Normalization and Variance Stabilization of Single-Cell RNA-Seq Data Using Regularized Negative Binomial Regression.” Genome Biology 20 (1). Springer Science; Business Media LLC. https://doi.org/10.1186/s13059-019-1874-1.
Liu, Fenglin, Yuanyuan Zhang, Lei Zhang, Ziyi Li, Qiao Fang, Ranran Gao, and Zemin Zhang. 2019. “Systematic Comparative Analysis of Single-Nucleotide Variant Detection Methods from Single-Cell RNA Sequencing Data.” Genome Biology 20 (1). Springer Science; Business Media LLC. https://doi.org/10.1186/s13059-019-1863-4.
romanhaa. 2020. “Romanhaa/cerebroApp.” GitHub. https://github.com/romanhaa/cerebroApp. https://github.com/romanhaa/cerebroApp.
“Satija Lab.” 2020. http://www.satijalab.org/seurat/v3.1/cell_cycle_vignette.html. https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html.
Tirosh, I., B. Izar, S. M. Prakadan, M. H. Wadsworth, D. Treacy, J. J. Trombetta, A. Rotem, et al. 2016. “Dissecting the Multicellular Ecosystem of Metastatic Melanoma by Single-Cell RNA-Seq.” Science 352 (6282). American Association for the Advancement of Science (AAAS): 189–96. https://doi.org/10.1126/science.aad0501.
“Understanding Umap.” 2019. https://pair-code.github.io/understanding-umap/. https://pair-code.github.io/understanding-umap/.